diff --git a/.bazelrc b/.bazelrc
index 612a1e59aeb806..4ff08fd221f5dc 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -231,7 +231,7 @@ common:apple-toolchain --host_crosstool_top=@local_config_apple_cc//:toolchain
 common:macos_arm64 --cpu=darwin_arm64
 common:macos_arm64 --macos_minimum_os=11.0
 common:macos_arm64 --config=clang_local
-common:macos_arm64 --platforms=@build_bazel_apple_support//configs/platforms:darwin_arm64
+common:macos_arm64 --platforms=@build_bazel_apple_support//platforms:darwin_arm64

 # iOS configs for each architecture and the fat binary builds.
 common:ios --apple_platform_type=ios
@@ -244,16 +244,16 @@ common:ios_armv7 --cpu=ios_armv7
 common:ios_armv7 --platforms=@org_tensorflow//tensorflow/tools/toolchains/ios:ios_armv7
 common:ios_arm64 --config=ios
 common:ios_arm64 --cpu=ios_arm64
-common:ios_arm64 --platforms=@build_bazel_apple_support//configs/platforms:ios_arm64
+common:ios_arm64 --platforms=@build_bazel_apple_support//platforms:ios_arm64
 common:ios_arm64e --config=ios
 common:ios_arm64e --cpu=ios_arm64e
-common:ios_arm64e --platforms=@build_bazel_apple_support//configs/platforms:ios_arm64e
+common:ios_arm64e --platforms=@build_bazel_apple_support//platforms:ios_arm64e
 common:ios_sim_arm64 --config=ios
 common:ios_sim_arm64 --cpu=ios_sim_arm64
-common:ios_sim_arm64 --platforms=@build_bazel_apple_support//configs/platforms:ios_sim_arm64
+common:ios_sim_arm64 --platforms=@build_bazel_apple_support//platforms:ios_sim_arm64
 common:ios_x86_64 --config=ios
 common:ios_x86_64 --cpu=ios_x86_64
-common:ios_x86_64 --platforms=@build_bazel_apple_support//configs/platforms:ios_x86_64
+common:ios_x86_64 --platforms=@build_bazel_apple_support//platforms:ios_x86_64
 common:ios_fat --config=ios
 common:ios_fat --ios_multi_cpus=armv7,arm64,i386,x86_64

@@ -282,19 +282,20 @@ common:mkl_threadpool -c opt
 # Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
 # with Eigen threadpool support
 common:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
+common:mkl_aarch64_threadpool --@compute_library//:openmp=false
 common:mkl_aarch64_threadpool -c opt

 # This is an alias for the mkl_aarch64_threadpool build.
 common:mkl_aarch64 --config=mkl_aarch64_threadpool

-# Default CUDA, CUDNN and NVSHMEM versions.
+# Default CUDA, CUDNN, NCCL and NVSHMEM versions.
 common:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.5.1"
 common:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0"
 common:cuda_version --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5"
+common:cuda_version --repo_env=HERMETIC_NCCL_VERSION="2.27.7"

 # CUDA: This config refers to building CUDA op kernels with nvcc.
 common:cuda --repo_env TF_NEED_CUDA=1
-common:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 common:cuda --@local_config_cuda//:enable_cuda
 common:cuda --config=cuda_version
 # This flag is needed to include CUDA libraries.
@@ -329,8 +330,6 @@ common:cuda_clang --linkopt="-lm"
 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
 common:cuda_clang_official --config=cuda_clang
 common:cuda_clang_official --config=cuda_version
-common:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-18/bin/clang"
-common:cuda_clang_official --crosstool_top="@local_config_cuda//crosstool:toolchain"

 # Build with nvcc for CUDA and clang for host
 common:cuda_nvcc --config=cuda
@@ -653,12 +652,6 @@ common:rbe_linux_cpu --python_path="/usr/bin/python3"
 # These you may need to change for your own GCP project.
common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance -# Download CUDA/CUDNN redistributions to preserve the repositories cache between -# CPU and GPU builds. -# TODO(ybaturina): Uncomment when RBE is ready to support this. -common:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1 -common:rbe_linux_cpu --config=cuda_version - # Deprecated RBE config with non-hermetic toolchains. common:rbe_linux_cpu_clang_local --config=rbe_linux_cpu common:rbe_linux_cpu_clang_local --config=clang_local @@ -785,14 +778,18 @@ common:release_gpu_linux_clang_local --config=release_cpu_linux_clang_local common:release_arm64_linux --config=release_linux_base common:release_arm64_linux --config=linux_arm64 -common:release_arm64_linux --config=clang_local -common:release_arm64_linux --repo_env=CC="/usr/lib/llvm-18/bin/clang" -common:release_arm64_linux --repo_env=BAZEL_COMPILER="/usr/lib/llvm-18/bin/clang" -common:release_arm64_linux --crosstool_top="@ml2014_clang_aarch64_config_aarch64//crosstool:toolchain" common:release_arm64_linux --config=mkl_aarch64_threadpool common:release_arm64_linux --copt=-flax-vector-conversions test:release_arm64_linux --flaky_test_attempts=3 +# Deprecated release CPU config with non-hermetic toolchains. +common:release_arm64_linux_clang_local --config=release_arm64_linux +common:release_arm64_linux_clang_local --config=clang_local +common:release_arm64_linux_clang_local --repo_env=CC="/usr/lib/llvm-18/bin/clang" +common:release_arm64_linux_clang_local --repo_env=BAZEL_COMPILER="/usr/lib/llvm-18/bin/clang" +common:release_arm64_linux_clang_local --crosstool_top="@ml2014_clang_aarch64_config_aarch64//crosstool:toolchain" +test:release_arm64_linux_clang_local --flaky_test_attempts=3 + common:release_cpu_macos --config=avx_linux # Base build configs for macOS diff --git a/.github/workflows/arm-cd.yml b/.github/workflows/arm-cd.yml index 2e3912041d9cf2..5430fc1c8151e8 100644 --- a/.github/workflows/arm-cd.yml +++ b/.github/workflows/arm-cd.yml @@ -52,12 +52,12 @@ jobs: run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true - name: Checkout repository for nightly (skipped for releases) if: ${{ github.event_name == 'schedule' }} - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: ref: 'nightly' - name: Checkout repository for releases (skipped for nightly) if: ${{ github.event_name == 'push' }} - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Build and test pip wheel shell: bash run: | diff --git a/.github/workflows/arm-ci-extended-cpp.yml b/.github/workflows/arm-ci-extended-cpp.yml index 54903a6998b090..09085e814daba1 100644 --- a/.github/workflows/arm-ci-extended-cpp.yml +++ b/.github/workflows/arm-ci-extended-cpp.yml @@ -50,12 +50,12 @@ jobs: run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . 
-o -prune -exec sudo rm -rf -- {} + || true - name: Checkout repository for nightly (skipped for releases) if: ${{ github.event_name == 'schedule' }} - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: ref: 'nightly' - name: Checkout repository if: ${{ github.event_name == 'push' }} - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Build binary and run C++ tests shell: bash run: | diff --git a/.github/workflows/arm-ci-extended.yml b/.github/workflows/arm-ci-extended.yml index 2235cfc2d986da..94237fcaa6cca5 100644 --- a/.github/workflows/arm-ci-extended.yml +++ b/.github/workflows/arm-ci-extended.yml @@ -51,12 +51,12 @@ jobs: run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true - name: Checkout repository for nightly (skipped for releases) if: ${{ github.event_name == 'schedule' }} - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: ref: 'nightly' - name: Checkout repository if: ${{ github.event_name == 'push' }} - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Build binary and run python tests on nightly for all python versions shell: bash run: | diff --git a/.github/workflows/arm-ci.yml b/.github/workflows/arm-ci.yml index a141bdd4676852..12d8ab4a2cf719 100644 --- a/.github/workflows/arm-ci.yml +++ b/.github/workflows/arm-ci.yml @@ -47,7 +47,7 @@ jobs: shell: bash run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . 
-o -prune -exec sudo rm -rf -- {} + || true - name: Checkout repository - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Build binary and run python tests shell: bash run: | diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml index 6421e08ccf0839..de578ffec96327 100644 --- a/.github/workflows/cffconvert.yml +++ b/.github/workflows/cffconvert.yml @@ -30,7 +30,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out a copy of the repository - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Check whether the citation metadata from CITATION.cff is valid uses: citation-file-format/cffconvert-github-action@4cf11baa70a673bfdf9dad0acc7ee33b3f4b6084 # v2.0.0 diff --git a/.github/workflows/issue-on-pr-rollback.yml b/.github/workflows/issue-on-pr-rollback.yml index d5e0661a5f356b..1d548e9204e563 100644 --- a/.github/workflows/issue-on-pr-rollback.yml +++ b/.github/workflows/issue-on-pr-rollback.yml @@ -33,7 +33,7 @@ jobs: startsWith(github.event.head_commit.message, 'Rollback of PR #') steps: - name: Checkout repo - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Create a new Github Issue uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml index 07896a48470753..984dead9db7388 100644 --- a/.github/workflows/osv-scanner-scheduled.yml +++ b/.github/workflows/osv-scanner-scheduled.yml @@ -28,7 +28,7 @@ permissions: jobs: scan-scheduled: if: github.repository == 'tensorflow/tensorflow' - uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v2.2.4" + uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v2.3.0" with: scan-args: |- --lockfile=requirements.txt:./requirements_lock_3_9.txt diff --git a/.github/workflows/pylint-presubmit.yml b/.github/workflows/pylint-presubmit.yml index 59068d9d86f45d..483cf5bfc0addf 100644 --- a/.github/workflows/pylint-presubmit.yml +++ b/.github/workflows/pylint-presubmit.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Get file changes id: get_file_changes uses: trilom/file-changes-action@a6ca26c14274c33b15e6499323aac178af06ad4b # v1.2.4 @@ -38,7 +38,7 @@ jobs: run: | echo Changed files: ${{ steps.get_file_changes.outputs.files }} - name: Set up Python 3.9 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: "3.9" - name: Install Python dependencies diff --git a/.github/workflows/release-branch-cherrypick.yml b/.github/workflows/release-branch-cherrypick.yml index 69e03a040ae1a2..fc643c92d304d1 100644 --- a/.github/workflows/release-branch-cherrypick.yml +++ b/.github/workflows/release-branch-cherrypick.yml @@ -45,7 +45,7 @@ jobs: if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: 
actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: ref: ${{ github.event.inputs.release_branch }} - name: Get some helpful info for formatting @@ -58,7 +58,7 @@ jobs: echo "SHORTSHA=$(git log -1 ${{ github.event.inputs.git_commit }} --format="%h")" >> "$GITHUB_OUTPUT" echo "TITLE=$(git log -1 ${{ github.event.inputs.git_commit }} --format="%s")" >> "$GITHUB_OUTPUT" - name: Create Pull Request with changes - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8 + uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9 with: title: '${{ github.event.inputs.release_branch }} cherry-pick: ${{ steps.cherrypick.outputs.SHORTSHA }} "${{ steps.cherrypick.outputs.TITLE }}"' committer: TensorFlow Release Automation diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml index e635c4cd8ccc88..ce2d7075019b5d 100644 --- a/.github/workflows/scorecards-analysis.yml +++ b/.github/workflows/scorecards-analysis.yml @@ -41,7 +41,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: persist-credentials: false @@ -64,6 +64,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard (optional). # Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@0499de31b99561a6d14a36a5f662c2a54f91beee # v3.29.5 + uses: github/codeql-action/upload-sarif@fe4161a26a8629af62121b670040955b330f9af2 # v3.29.5 with: sarif_file: results.sarif diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml index a8dba883f5ff14..d2cc83b7f5c2c2 100644 --- a/.github/workflows/update-rbe.yml +++ b/.github/workflows/update-rbe.yml @@ -30,7 +30,7 @@ jobs: if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks steps: - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Update the RBE Configs run: | function map() { @@ -130,7 +130,7 @@ jobs: map sigbuild-r2.17-clang-python3.11 2.17-python3.11 map sigbuild-r2.17-clang-python3.12 2.17-python3.12 - name: Create Pull Request with changes - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8 + uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9 with: title: Update the RBE images to the latest container versions committer: TensorFlow Release Automation diff --git a/WORKSPACE b/WORKSPACE index 0fc24cb3edd116..0c4c70e21101bc 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -4,26 +4,28 @@ workspace(name = "org_tensorflow") # buildifier: disable=load-on-top -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") -http_archive( +tf_http_archive( name = "rules_shell", sha256 = "bc61ef94facc78e20a645726f64756e5e285a045037c7a61f65af2941f4c25e1", strip_prefix = "rules_shell-0.4.1", - url = "https://github.com/bazelbuild/rules_shell/releases/download/v0.4.1/rules_shell-v0.4.1.tar.gz", + urls = tf_mirror_urls( + "https://github.com/bazelbuild/rules_shell/releases/download/v0.4.1/rules_shell-v0.4.1.tar.gz", + ), ) # Initialize toolchains for ML projects. 
# # A hermetic build system is designed to produce completely reproducible builds for C++. # Details: https://github.com/google-ml-infra/rules_ml_toolchain -http_archive( +tf_http_archive( name = "rules_ml_toolchain", - sha256 = "7f00b3e94bbca1a4737ded6b9ed5358f6d1c86430c2ec97c90081343c0482f18", - strip_prefix = "rules_ml_toolchain-29d54c875da37e74b8548924ed30e78cb28126b9", - urls = [ - "https://github.com/yuriivcs/rules_ml_toolchain/archive/29d54c875da37e74b8548924ed30e78cb28126b9.tar.gz", - ], + sha256 = "1c2c530a054e9e8b3c811ec21ed8a687fc865bec3abbc8ff65beb829b1d67ae4", + strip_prefix = "rules_ml_toolchain-6734d2a174bf29e731d3f473743d1cc1a86100c3", + urls = tf_mirror_urls( + "https://github.com/google-ml-infra/rules_ml_toolchain/archive/6734d2a174bf29e731d3f473743d1cc1a86100c3.tar.gz", + ), ) load( diff --git a/ci/official/containers/ml_build/Dockerfile b/ci/official/containers/ml_build/Dockerfile index a4fb0cd9b1640a..ba090e65c95b33 100644 --- a/ci/official/containers/ml_build/Dockerfile +++ b/ci/official/containers/ml_build/Dockerfile @@ -58,10 +58,10 @@ RUN if [ -e "/usr/local/cuda/compat/libcuda.so.1" ]; then ln -s /usr/local/cuda/ # - buildozer: clean bazel build deps # - gcloud SDK: communicate with Google Cloud Platform (GCP) for RBE, CI # - patchelf: Utility tool to modify existing ELF executables and libraries -RUN git clone --branch v1.11.0 https://github.com/bats-core/bats-core.git && bats-core/install.sh /usr/local && rm -rf bats-core -RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.21.0/bazelisk-linux-amd64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel -RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildifier-linux-amd64 -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier -RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildozer-linux-amd64 -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer +RUN git clone --branch v1.13.0 https://github.com/bats-core/bats-core.git && bats-core/install.sh /usr/local && rm -rf bats-core +RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.27.0/bazelisk-linux-amd64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel +RUN wget https://github.com/bazelbuild/buildtools/releases/download/v8.2.1/buildifier-linux-amd64 -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier +RUN wget https://github.com/bazelbuild/buildtools/releases/download/v8.2.1/buildozer-linux-amd64 -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz | \ tar zxf - google-cloud-sdk && \ diff --git a/ci/official/containers/ml_build_arm64/Dockerfile b/ci/official/containers/ml_build_arm64/Dockerfile deleted file mode 100644 index 379162d0d1af76..00000000000000 --- a/ci/official/containers/ml_build_arm64/Dockerfile +++ /dev/null @@ -1,75 +0,0 @@ -################################################################################ -FROM ubuntu:20.04@sha256:8e5c4f0285ecbb4ead070431d29b576a530d3166df73ec44affc1cd27555141b as devel -################################################################################ - -# Install devtoolset build dependencies -COPY setup.sources.sh /setup.sources.sh -COPY setup.packages.sh /setup.packages.sh -COPY builder.packages.txt /builder.packages.txt - -RUN /setup.sources.sh && /setup.packages.sh /builder.packages.txt - -RUN update-ca-certificates -# Install devtoolset-9 in /dt10 with glibc 
2.17 and libstdc++ 4.8, for building -# manylinux2014-compatible packages. -COPY builder.devtoolset/fixlinks_aarch64.sh /fixlinks.sh -COPY builder.devtoolset/rpm-patch.sh /rpm-patch.sh -COPY builder.devtoolset/build_devtoolset.sh /build_devtoolset.sh -COPY builder.devtoolset/gcc9-fixups.patch /gcc9-fixups.patch -COPY builder.devtoolset/stringop_trunc.patch /stringop_trunc.patch - -RUN /build_devtoolset.sh devtoolset-10 /dt10 - -# Build later version of patchelf that is not so buggy -RUN wget https://github.com/NixOS/patchelf/releases/download/0.18.0/patchelf-0.18.0-aarch64.tar.gz && tar -zxvf patchelf-0.18.0-aarch64.tar.gz -C /usr && rm -rf patchelf-0.18.0-aarch64.tar.gz - -RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-arm.tar.gz | \ - tar zxf - google-cloud-sdk && \ - google-cloud-sdk/install.sh --quiet -ENV PATH="$PATH:/google-cloud-sdk/bin/" - -# Install various tools. -# - bats: bash unit testing framework -# NOTE: v1.6.0 seems to have a bug that made "git" in setup_file break -# - bazelisk: always use the correct bazel version -# - buildifier: clean bazel build depshttps://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildifier-linux-arm64 -# - buildozer: clean bazel build deps -RUN git clone --branch v1.11.0 https://github.com/bats-core/bats-core.git && bats-core/install.sh /usr/local && rm -rf bats-core -RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.21.0/bazelisk-linux-arm64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel -RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildifier-linux-arm64 -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier -RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildozer-linux-arm64 -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer - -RUN groupadd -g 1001 buildslave && useradd -m -u 1001 -g buildslave buildslave -RUN mkdir -p /tf/venv -RUN chown -R buildslave:buildslave /tf - -# All lines past this point are reset when $CACHEBUSTER is set. We need this -# for Python specifically because we install some nightly packages which are -# likely to change daily. -ARG CACHEBUSTER=0 -RUN echo $CACHEBUSTER - -# Setup build and environment -COPY devel.bashrc /root/.bashrc -COPY ld.so.conf /dt10/etc/ - -# Make sure clang is on the path -RUN ln -s /usr/lib/llvm-18/bin/clang /usr/bin/clang - -# Setup JAX Python environment. -COPY requirements.txt /requirements.txt -COPY setup.python.sh /setup.python.sh -RUN /setup.python.sh python3.9 requirements.txt -RUN /setup.python.sh python3.10 requirements.txt -RUN /setup.python.sh python3.11 requirements.txt -RUN /setup.python.sh python3.12 requirements.txt -RUN /setup.python.sh python3.13 requirements.txt -# python3.13-nogil is a free-threaded build of python3.13. 
-RUN /setup.python.sh python3.13-nogil requirements.txt -RUN /setup.python.sh python3.14 requirements.txt -RUN /setup.python.sh python3.14-nogil requirements.txt - -# Python commands by default run under 3.11 -RUN ln -sf /usr/bin/python3.11 /usr/bin/python3 -RUN ln -sf /usr/bin/python3.11 /usr/bin/python -RUN ln -sf /usr/lib/python3.11 /usr/lib/tf_python diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/build_devtoolset.sh b/ci/official/containers/ml_build_arm64/builder.devtoolset/build_devtoolset.sh deleted file mode 100755 index d59923d405a8c8..00000000000000 --- a/ci/official/containers/ml_build_arm64/builder.devtoolset/build_devtoolset.sh +++ /dev/null @@ -1,157 +0,0 @@ -#!/bin/bash -eu -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# Builds a devtoolset cross-compiler targeting manylinux2014 (glibc 2.17 / libstdc++ 4.8). - -VERSION="$1" -TARGET="$2" - -case "${VERSION}" in -devtoolset-9) - LIBSTDCXX_VERSION="6.0.28" - LIBSTDCXX_ABI="new" - ;; -devtoolset-10) - LIBSTDCXX_VERSION="6.0.28" - LIBSTDCXX_ABI="new" - ;; -*) - echo "Usage: $0 {devtoolset-9|devtoolset-10} " - echo "Use 'devtoolset-9' to build a manylinux2014 compatible toolchain" - exit 1 - ;; -esac - -mkdir -p "${TARGET}" - -mkdir -p ${TARGET}/usr/include - -# Put the current kernel headers from ubuntu in place. -ln -s "/usr/include/linux" "${TARGET}/usr/include/linux" -ln -s "/usr/include/asm-generic" "${TARGET}/usr/include/asm-generic" -ln -s "/usr/include/aarch64-linux-gnu/asm" "${TARGET}/usr/include/asm" - -# Download glibc's shared and development libraries based on the value of the -# `VERSION` parameter. -# Note: 'Templatizing' this and the other conditional branches would require -# defining several variables (version, os, path) making it difficult to maintain -# and extend for future modifications. -mkdir -p glibc-src -mkdir -p glibc-build -cd glibc-src -wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "https://vault.centos.org/centos/7/os/Source/SPackages/glibc-2.17-317.el7.src.rpm" -rpm2cpio "glibc-2.17-317.el7.src.rpm" |cpio -idmv -tar -xvzf "glibc-2.17-c758a686.tar.gz" --strip 1 -tar -xvzf "glibc-2.17-c758a686-releng.tar.gz" --strip 1 -sed -i '/patch0060/d' glibc.spec -/rpm-patch.sh "glibc.spec" -rm -f "glibc-2.17-317.el7.src.rpm" "glibc-2.17-c758a686.tar.gz" "glibc-2.17-c758a686-releng.tar.gz" -patch -p1 < /gcc9-fixups.patch -patch -p1 < /stringop_trunc.patch -cd ../glibc-build -../glibc-src/configure --prefix=/usr --disable-werror --enable-obsolete-rpc --disable-profile -make -j$(nproc) -make install DESTDIR=${TARGET} -cd .. - -# Symlinks in the binary distribution are set up for installation in /usr, we -# need to fix up all the links to stay within /${TARGET}. -/fixlinks.sh "/${TARGET}" - -# Patch to allow non-glibc 2.12 compatible builds to work. 
-sed -i '54i#define TCP_USER_TIMEOUT 18' "/${TARGET}/usr/include/netinet/tcp.h" - -# Download specific version of libstdc++ shared library based on the value of -# the `VERSION` parameter - # Download binary libstdc++ 4.8 shared library release -wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "http://old-releases.ubuntu.com/ubuntu/pool/main/g/gcc-4.8/libstdc++6_4.8.1-10ubuntu8_arm64.deb" && \ - unar "libstdc++6_4.8.1-10ubuntu8_arm64.deb" && \ - tar -C "${TARGET}" -xvzf "libstdc++6_4.8.1-10ubuntu8_arm64/data.tar.gz" "./usr/lib/aarch64-linux-gnu/libstdc++.so.6.0.18" && \ - rm -rf "libstdc++6_4.8.1-10ubuntu8_arm64.deb" "libstdc++6_4.8.1-10ubuntu8_arm64" - -mkdir -p "${TARGET}-src" -cd "${TARGET}-src" - -# Build a devtoolset cross-compiler based on our glibc 2.12/glibc 2.17 sysroot setup. -case "${VERSION}" in -devtoolset-9) - wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "https://vault.centos.org/centos/7/sclo/Source/rh/devtoolset-9-gcc-9.3.1-2.2.el7.src.rpm" - rpm2cpio "devtoolset-9-gcc-9.3.1-2.2.el7.src.rpm" |cpio -idmv - tar -xvf "gcc-9.3.1-20200408.tar.xz" --strip 1 - ;; -devtoolset-10) - wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "https://vault.centos.org/centos/7/sclo/Source/rh/devtoolset-10-gcc-10.2.1-11.2.el7.src.rpm" - rpm2cpio "devtoolset-10-gcc-10.2.1-11.2.el7.src.rpm" |cpio -idmv - tar -xvf "gcc-10.2.1-20210130.tar.xz" --strip 1 - ;; -esac - -# Apply the devtoolset patches to gcc. -/rpm-patch.sh "gcc.spec" - -./contrib/download_prerequisites - -mkdir -p "${TARGET}-build" -cd "${TARGET}-build" - -"${TARGET}-src/configure" \ - --prefix="${TARGET}/usr" \ - --with-sysroot="/${TARGET}" \ - --disable-bootstrap \ - --disable-libmpx \ - --enable-libsanitizer \ - --disable-libunwind-exceptions \ - --disable-libunwind-exceptions \ - --disable-lto \ - --disable-multilib \ - --enable-__cxa_atexit \ - --enable-gnu-indirect-function \ - --enable-gnu-unique-object \ - --enable-initfini-array \ - --enable-languages="c,c++" \ - --enable-linker-build-id \ - --enable-plugin \ - --enable-shared \ - --enable-threads=posix \ - --with-default-libstdcxx-abi=${LIBSTDCXX_ABI} \ - --with-gcc-major-version-only \ - --with-linker-hash-style="gnu" \ - && \ - make -j$(nproc) && \ - make install - - -# Create the devtoolset libstdc++ linkerscript that links dynamically against -# the system libstdc++ 4.4 and provides all other symbols statically. -# Note that the installation path for libstdc++ here is ${TARGET}/usr/lib64/ -mv "${TARGET}/usr/lib64/libstdc++.so.${LIBSTDCXX_VERSION}" \ - "${TARGET}/usr/lib64/libstdc++.so.${LIBSTDCXX_VERSION}.backup" -echo -e "OUTPUT_FORMAT(elf64-littleaarch64)\nINPUT ( libstdc++.so.6.0.18 -lstdc++_nonshared44 )" \ - > "${TARGET}/usr/lib64/libstdc++.so.${LIBSTDCXX_VERSION}" -cp "./aarch64-unknown-linux-gnu/libstdc++-v3/src/.libs/libstdc++_nonshared44.a" \ - "${TARGET}/usr/lib64" - - -# Link in architecture specific includes from the system; note that we cannot -# link in the whole aarch64-linux-gnu folder, as otherwise we're overlaying -# system gcc paths that we do not want to find. -# TODO(klimek): Automate linking in all non-gcc / non-kernel include -# directories. 
-mkdir -p "${TARGET}/usr/include/aarch64-linux-gnu" -PYTHON_VERSIONS=("python3.9" "python3.10" "python3.11" "python3.12") -for v in "${PYTHON_VERSIONS[@]}"; do - ln -s "/usr/local/include/${v}" "${TARGET}/usr/include/aarch64-linux-gnu/${v}" -done diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/fixlinks_aarch64.sh b/ci/official/containers/ml_build_arm64/builder.devtoolset/fixlinks_aarch64.sh deleted file mode 100755 index 09a5f9854d42ef..00000000000000 --- a/ci/official/containers/ml_build_arm64/builder.devtoolset/fixlinks_aarch64.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# Re-direct all links in $1 that are relative to be canonical - -BASE="$1" -find "${BASE}" -type l | \ - while read l ; do - if [[ "$(readlink "$l")" == \.\./* ]]; then - CANONICAL="$(readlink "$l")"; - rm "$l"; - ln -s "${CANONICAL}" "$l" - fi - done - diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/gcc9-fixups.patch b/ci/official/containers/ml_build_arm64/builder.devtoolset/gcc9-fixups.patch deleted file mode 100644 index 7b9bbf358ada74..00000000000000 --- a/ci/official/containers/ml_build_arm64/builder.devtoolset/gcc9-fixups.patch +++ /dev/null @@ -1,270 +0,0 @@ -diff --git a/iconv/gconv.h b/iconv/gconv.h -index 3f9112e..8e60197 100644 ---- a/iconv/gconv.h -+++ b/iconv/gconv.h -@@ -174,7 +174,7 @@ typedef struct __gconv_info - { - size_t __nsteps; - struct __gconv_step *__steps; -- __extension__ struct __gconv_step_data __data __flexarr; -+ __extension__ struct __gconv_step_data __data[0]; - } *__gconv_t; - - #endif /* gconv.h */ -diff --git a/include/libc-symbols.h b/include/libc-symbols.h -index c555bf2..143b26d 100644 ---- a/include/libc-symbols.h -+++ b/include/libc-symbols.h -@@ -107,6 +107,11 @@ - # endif - #endif - -+#ifndef __attribute_copy__ -+/* Provide an empty definition when cdefs.h is not included. */ -+# define __attribute_copy__(arg) -+#endif -+ - #ifndef __ASSEMBLER__ - /* GCC understands weak symbols and aliases; use its interface where - possible, instead of embedded assembly language. */ -@@ -114,7 +119,8 @@ - /* Define ALIASNAME as a strong alias for NAME. */ - # define strong_alias(name, aliasname) _strong_alias(name, aliasname) - # define _strong_alias(name, aliasname) \ -- extern __typeof (name) aliasname __attribute__ ((alias (#name))); -+ extern __typeof (name) aliasname __attribute__ ((alias (#name))) \ -+ __attribute_copy__ (name); - - /* This comes between the return type and function name in - a function definition to make that definition weak. */ -@@ -125,14 +131,16 @@ - If weak aliases are not available, this defines a strong alias. 
*/ - # define weak_alias(name, aliasname) _weak_alias (name, aliasname) - # define _weak_alias(name, aliasname) \ -- extern __typeof (name) aliasname __attribute__ ((weak, alias (#name))); -+ extern __typeof (name) aliasname __attribute__ ((weak, alias (#name))) \ -+ __attribute_copy__ (name); - - /* Same as WEAK_ALIAS, but mark symbol as hidden. */ - # define weak_hidden_alias(name, aliasname) \ - _weak_hidden_alias (name, aliasname) - # define _weak_hidden_alias(name, aliasname) \ - extern __typeof (name) aliasname \ -- __attribute__ ((weak, alias (#name), __visibility__ ("hidden"))); -+ __attribute__ ((weak, alias (#name), __visibility__ ("hidden"))) \ -+ __attribute_copy__ (name); - - /* Declare SYMBOL as weak undefined symbol (resolved to 0 if not defined). */ - # define weak_extern(symbol) _weak_extern (weak symbol) -@@ -528,7 +536,8 @@ for linking") - # define __hidden_ver1(local, internal, name) \ - extern __typeof (name) __EI_##name __asm__(__hidden_asmname (#internal)); \ - extern __typeof (name) __EI_##name \ -- __attribute__((alias (__hidden_asmname (#local)))) -+ __attribute__((alias (__hidden_asmname (#local)))) \ -+ __attribute_copy__ (name) - # define hidden_ver(local, name) __hidden_ver1(local, __GI_##name, name); - # define hidden_data_ver(local, name) hidden_ver(local, name) - # define hidden_def(name) __hidden_ver1(__GI_##name, name, name); -@@ -541,7 +550,8 @@ for linking") - # define __hidden_nolink1(local, internal, name, version) \ - __hidden_nolink2 (local, internal, name, version) - # define __hidden_nolink2(local, internal, name, version) \ -- extern __typeof (name) internal __attribute__ ((alias (#local))); \ -+ extern __typeof (name) internal __attribute__ ((alias (#local))) \ -+ __attribute_copy__ (name); \ - __hidden_nolink3 (local, internal, #name "@" #version) - # define __hidden_nolink3(local, internal, vername) \ - __asm__ (".symver " #internal ", " vername); -diff --git a/locale/weightwc.h b/locale/weightwc.h -index e966c03..22ab790 100644 ---- a/locale/weightwc.h -+++ b/locale/weightwc.h -@@ -79,19 +79,19 @@ findidx (const wint_t **cpp, size_t len) - if (cp[cnt] != usrc[cnt]) - break; - -- if (cnt < nhere - 1) -+ if (cnt < nhere - 1 || cnt == len) - { - cp += 2 * nhere; - continue; - } - -- if (cp[nhere - 1] > usrc[nhere -1]) -+ if (cp[nhere - 1] > usrc[nhere - 1]) - { - cp += 2 * nhere; - continue; - } - -- if (cp[2 * nhere - 1] < usrc[nhere -1]) -+ if (cp[2 * nhere - 1] < usrc[nhere - 1]) - { - cp += 2 * nhere; - continue; -diff --git a/locale/xlocale.h b/locale/xlocale.h -index 98c080b..843bd45 100644 ---- a/locale/xlocale.h -+++ b/locale/xlocale.h -@@ -20,6 +20,9 @@ - #ifndef _XLOCALE_H - #define _XLOCALE_H 1 - -+#ifndef _BITS_TYPES___LOCALE_T_H -+#define _BITS_TYPES___LOCALE_T_H 1 -+ - /* Structure for reentrant locale using functions. This is an - (almost) opaque type for the user level programs. The file and - this data structure is not standardized. Don't rely on it. It can -@@ -41,4 +44,6 @@ typedef struct __locale_struct - /* POSIX 2008 makes locale_t official. */ - typedef __locale_t locale_t; - -+#endif /* bits/types/__locale_t.h */ -+ - #endif /* xlocale.h */ -diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h -index d1cb3dd..30482a1 100644 ---- a/misc/sys/cdefs.h -+++ b/misc/sys/cdefs.h -@@ -423,4 +423,14 @@ - # endif - #endif - -+/* Undefine (also defined in libc-symbols.h). */ -+#undef __attribute_copy__ -+#if __GNUC_PREREQ (9, 0) -+/* Copies attributes from the declaration or type referenced by -+ the argument. 
*/ -+# define __attribute_copy__(arg) __attribute__ ((__copy__ (arg))) -+#else -+# define __attribute_copy__(arg) -+#endif -+ - #endif /* sys/cdefs.h */ -diff --git a/stdlib/setenv.c b/stdlib/setenv.c -index 45efe2e..06bfab0 100644 ---- a/stdlib/setenv.c -+++ b/stdlib/setenv.c -@@ -319,6 +319,7 @@ unsetenv (const char *name) - - ep = __environ; - if (ep != NULL) -+ { - while (*ep != NULL) - if (!strncmp (*ep, name, len) && (*ep)[len] == '=') - { -@@ -332,6 +333,7 @@ unsetenv (const char *name) - } - else - ++ep; -+ } - - UNLOCK; - -diff --git a/support/Makefile b/support/Makefile -index a253698..2f4e2a9 100644 ---- a/support/Makefile -+++ b/support/Makefile -@@ -167,13 +167,6 @@ CFLAGS-support_paths.c = \ - -DINSTDIR_PATH=\"$(prefix)\" \ - -DLIBDIR_PATH=\"$(libdir)\" - --ifeq (,$(CXX)) --LINKS_DSO_PROGRAM = links-dso-program-c --else --LINKS_DSO_PROGRAM = links-dso-program --LDLIBS-links-dso-program = -lstdc++ -lgcc -lgcc_s $(libunwind) --endif -- - LDLIBS-test-container = $(libsupport) - - others += test-container -@@ -182,9 +175,6 @@ others-noinstall += test-container - others += shell-container echo-container true-container - others-noinstall += shell-container echo-container true-container - --others += $(LINKS_DSO_PROGRAM) --others-noinstall += $(LINKS_DSO_PROGRAM) -- - $(objpfx)test-container : $(libsupport) - $(objpfx)shell-container : $(libsupport) - $(objpfx)echo-container : $(libsupport) -diff --git a/support/links-dso-program.cc b/support/links-dso-program.cc -index 8ff3155..f9d2b77 100644 ---- a/support/links-dso-program.cc -+++ b/support/links-dso-program.cc -@@ -3,6 +3,11 @@ - backported. */ - #define _ISOMAC 1 - -+#define __GLIBC_USE(F) __GLIBC_USE_ ## F -+ -+# define __attribute_alloc_size__(params) \ -+ __attribute__ ((__alloc_size__ params)) -+ - #include - - using namespace std; -diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h -index 185402f..bbdeae0 100644 ---- a/sysdeps/aarch64/dl-machine.h -+++ b/sysdeps/aarch64/dl-machine.h -@@ -49,23 +49,11 @@ elf_machine_load_address (void) - /* To figure out the load address we use the definition that for any symbol: - dynamic_addr(symbol) = static_addr(symbol) + load_addr - -- The choice of symbol is arbitrary. The static address we obtain -- by constructing a non GOT reference to the symbol, the dynamic -- address of the symbol we compute using adrp/add to compute the -- symbol's address relative to the PC. */ -- -- ElfW(Addr) static_addr; -- ElfW(Addr) dynamic_addr; -- -- asm (" \n\ -- adrp %1, _dl_start; \n\ -- add %1, %1, #:lo12:_dl_start \n\ -- ldr %w0, 1f \n\ -- b 2f \n\ --1: .word _dl_start \n\ --2: \n\ -- " : "=r" (static_addr), "=r" (dynamic_addr)); -- return dynamic_addr - static_addr; -+ _DYNAMIC sysmbol is used here as its link-time address stored in -+ the special unrelocated first GOT entry. 
*/ -+ -+ extern ElfW(Dyn) _DYNAMIC[] attribute_hidden; -+ return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic (); - } - - /* Set up the loaded object described by L so its unrelocated PLT -diff --git a/sysdeps/ieee754/dbl-64/k_rem_pio2.c b/sysdeps/ieee754/dbl-64/k_rem_pio2.c -index fcf956a..e2c5d29 100644 ---- a/sysdeps/ieee754/dbl-64/k_rem_pio2.c -+++ b/sysdeps/ieee754/dbl-64/k_rem_pio2.c -@@ -172,7 +172,8 @@ int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int3 - - /* compute q[0],q[1],...q[jk] */ - for (i=0;i<=jk;i++) { -- for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw; -+ for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; -+ q[i] = fw; - } - - jz = jk; -diff --git a/sysdeps/ieee754/flt-32/k_rem_pio2f.c b/sysdeps/ieee754/flt-32/k_rem_pio2f.c -index e54a067..215b0e0 100644 ---- a/sysdeps/ieee754/flt-32/k_rem_pio2f.c -+++ b/sysdeps/ieee754/flt-32/k_rem_pio2f.c -@@ -65,7 +65,8 @@ int __kernel_rem_pio2f(float *x, float *y, int e0, int nx, int prec, const int32 - - /* compute q[0],q[1],...q[jk] */ - for (i=0;i<=jk;i++) { -- for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw; -+ for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; -+ q[i] = fw; - } - - jz = jk; diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/rpm-patch.sh b/ci/official/containers/ml_build_arm64/builder.devtoolset/rpm-patch.sh deleted file mode 100755 index 892ae2af86a3fa..00000000000000 --- a/ci/official/containers/ml_build_arm64/builder.devtoolset/rpm-patch.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -eu -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# Given an RPM spec file $1, apply its patches. - -SPEC="$1" -grep '%patch' "${SPEC}" |while read cmd ; do - N=$(echo "${cmd}" |sed 's,%patch\([0-9]\+\).*,\1,') - file=$(grep "Patch$N:" "${SPEC}" |sed 's,.*: ,,') - parg=$(echo "${cmd}" |sed 's,.*\(-p[0-9]\).*,\1,') - if [[ ! "${file}" =~ doxygen && "${cmd}" != \#* ]]; then - echo "patch ${parg} -s < ${file}" - patch ${parg} -s < "${file}" - fi -done diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/stringop_trunc.patch b/ci/official/containers/ml_build_arm64/builder.devtoolset/stringop_trunc.patch deleted file mode 100644 index bd9e5533118d6c..00000000000000 --- a/ci/official/containers/ml_build_arm64/builder.devtoolset/stringop_trunc.patch +++ /dev/null @@ -1,1204 +0,0 @@ -diff --git a/bits/utmp.h b/bits/utmp.h -index 775123d..bf28c6d 100644 ---- a/bits/utmp.h -+++ b/bits/utmp.h -@@ -1,5 +1,5 @@ --/* The `struct utmp' type, describing entries in the utmp file. Generic/BSDish -- Copyright (C) 1993, 1996, 1997 Free Software Foundation, Inc. -+/* The `struct utmp' type, describing entries in the utmp file. -+ Copyright (C) 1993-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or -@@ -14,7 +14,7 @@ - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see -- . */ -+ . */ - - #ifndef _UTMP_H - # error "Never include directly; use instead." -@@ -24,11 +24,13 @@ - #include - - --#define UT_NAMESIZE 8 --#define UT_LINESIZE 8 --#define UT_HOSTSIZE 16 -+#define UT_LINESIZE 32 -+#define UT_NAMESIZE 32 -+#define UT_HOSTSIZE 256 - - -+/* The structure describing an entry in the database of -+ previous logins. */ - struct lastlog - { - time_t ll_time; -@@ -36,12 +38,16 @@ struct lastlog - char ll_host[UT_HOSTSIZE]; - }; - -+/* The structure describing an entry in the user accounting database. */ - struct utmp - { -- char ut_line[UT_LINESIZE]; -- char ut_user[UT_NAMESIZE]; -+ char ut_line[UT_LINESIZE] -+ __attribute_nonstring__; /* Devicename. */ -+ char ut_user[UT_NAMESIZE] -+ __attribute_nonstring__; /* Username. */ - #define ut_name ut_user -- char ut_host[UT_HOSTSIZE]; -+ char ut_host[UT_HOSTSIZE] -+ __attribute_nonstring__; /* Hostname for remote login. */ - long int ut_time; - }; - -diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h -index 30482a1..551d5fd 100644 ---- a/misc/sys/cdefs.h -+++ b/misc/sys/cdefs.h -@@ -391,6 +391,15 @@ - - #include - -+#if __GNUC_PREREQ (8, 0) -+/* Describes a char array whose address can safely be passed as the first -+ argument to strncpy and strncat, as the char array is not necessarily -+ a NUL-terminated string. */ -+# define __attribute_nonstring__ __attribute__ ((__nonstring__)) -+#else -+# define __attribute_nonstring__ -+#endif -+ - #if defined __LONG_DOUBLE_MATH_OPTIONAL && defined __NO_LONG_DOUBLE_MATH - # define __LDBL_COMPAT 1 - # ifdef __REDIRECT -diff --git a/nis/nss_nisplus/nisplus-parser.c b/nis/nss_nisplus/nisplus-parser.c -index a4d76fb..41600f0 100644 ---- a/nis/nss_nisplus/nisplus-parser.c -+++ b/nis/nss_nisplus/nisplus-parser.c -@@ -82,7 +82,7 @@ _nss_nisplus_parse_pwent (nis_result *result, struct passwd *pw, - - char *numstr = NISOBJVAL (2, obj); - len = NISOBJLEN (2, obj); -- if (len == 0 && numstr[len - 1] != '\0') -+ if (len == 0 || numstr[len - 1] != '\0') - { - if (len >= room_left) - goto no_more_room; -@@ -98,7 +98,7 @@ _nss_nisplus_parse_pwent (nis_result *result, struct passwd *pw, - - numstr = NISOBJVAL (3, obj); - len = NISOBJLEN (3, obj); -- if (len == 0 && numstr[len - 1] != '\0') -+ if (len == 0 || numstr[len - 1] != '\0') - { - if (len >= room_left) - goto no_more_room; -diff --git a/string/bits/string2.h b/string/bits/string2.h -index c9bf593..f461fc1 100644 ---- a/string/bits/string2.h -+++ b/string/bits/string2.h -@@ -47,29 +47,7 @@ - #endif - - #if _STRING_ARCH_unaligned --/* If we can do unaligned memory accesses we must know the endianess. 
*/ --# include - # include -- --# if __BYTE_ORDER == __LITTLE_ENDIAN --# define __STRING2_SMALL_GET16(src, idx) \ -- (((const unsigned char *) (const char *) (src))[idx + 1] << 8 \ -- | ((const unsigned char *) (const char *) (src))[idx]) --# define __STRING2_SMALL_GET32(src, idx) \ -- (((((const unsigned char *) (const char *) (src))[idx + 3] << 8 \ -- | ((const unsigned char *) (const char *) (src))[idx + 2]) << 8 \ -- | ((const unsigned char *) (const char *) (src))[idx + 1]) << 8 \ -- | ((const unsigned char *) (const char *) (src))[idx]) --# else --# define __STRING2_SMALL_GET16(src, idx) \ -- (((const unsigned char *) (const char *) (src))[idx] << 8 \ -- | ((const unsigned char *) (const char *) (src))[idx + 1]) --# define __STRING2_SMALL_GET32(src, idx) \ -- (((((const unsigned char *) (const char *) (src))[idx] << 8 \ -- | ((const unsigned char *) (const char *) (src))[idx + 1]) << 8 \ -- | ((const unsigned char *) (const char *) (src))[idx + 2]) << 8 \ -- | ((const unsigned char *) (const char *) (src))[idx + 3]) --# endif - #else - /* These are a few types we need for the optimizations if we cannot - use unaligned memory accesses. */ -@@ -94,148 +72,11 @@ __STRING2_COPY_TYPE (8); - - /* Set N bytes of S to C. */ - #if !defined _HAVE_STRING_ARCH_memset --# if !__GNUC_PREREQ (3, 0) --# if _STRING_ARCH_unaligned --# define memset(s, c, n) \ -- (__extension__ (__builtin_constant_p (n) && (n) <= 16 \ -- ? ((n) == 1 \ -- ? __memset_1 (s, c) \ -- : __memset_gc (s, c, n)) \ -- : (__builtin_constant_p (c) && (c) == '\0' \ -- ? ({ void *__s = (s); __bzero (__s, n); __s; }) \ -- : memset (s, c, n)))) -- --# define __memset_1(s, c) ({ void *__s = (s); \ -- *((__uint8_t *) __s) = (__uint8_t) c; __s; }) -- --# define __memset_gc(s, c, n) \ -- ({ void *__s = (s); \ -- union { \ -- unsigned int __ui; \ -- unsigned short int __usi; \ -- unsigned char __uc; \ -- } *__u = __s; \ -- __uint8_t __c = (__uint8_t) (c); \ -- \ -- /* This `switch' statement will be removed at compile-time. 
*/ \ -- switch ((unsigned int) (n)) \ -- { \ -- case 15: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 11: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 7: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 3: \ -- __u->__usi = (unsigned short int) __c * 0x0101; \ -- __u = __extension__ ((void *) __u + 2); \ -- __u->__uc = (unsigned char) __c; \ -- break; \ -- \ -- case 14: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 10: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 6: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 2: \ -- __u->__usi = (unsigned short int) __c * 0x0101; \ -- break; \ -- \ -- case 13: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 9: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 5: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 1: \ -- __u->__uc = (unsigned char) __c; \ -- break; \ -- \ -- case 16: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 12: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 8: \ -- __u->__ui = __c * 0x01010101; \ -- __u = __extension__ ((void *) __u + 4); \ -- case 4: \ -- __u->__ui = __c * 0x01010101; \ -- case 0: \ -- break; \ -- } \ -- \ -- __s; }) --# else --# define memset(s, c, n) \ -- (__extension__ (__builtin_constant_p (c) && (c) == '\0' \ -- ? ({ void *__s = (s); __bzero (__s, n); __s; }) \ -- : memset (s, c, n))) --# endif --# endif -- --/* GCC < 3.0 optimizes memset(s, 0, n) but not bzero(s, n). -- The optimization is broken before EGCS 1.1. -- GCC 3.0+ has __builtin_bzero as well, but at least till GCC 3.4 -- if it decides to call the library function, it calls memset -- and not bzero. */ --# if __GNUC_PREREQ (2, 91) --# define __bzero(s, n) __builtin_memset (s, '\0', n) --# endif -- -+# define __bzero(s, n) __builtin_memset (s, '\0', n) - #endif - -- --/* Copy N bytes from SRC to DEST, returning pointer to byte following the -- last copied. */ --#ifdef __USE_GNU --# if !defined _HAVE_STRING_ARCH_mempcpy || defined _FORCE_INLINES --# ifndef _HAVE_STRING_ARCH_mempcpy --# if __GNUC_PREREQ (3, 4) --# define __mempcpy(dest, src, n) __builtin_mempcpy (dest, src, n) --# elif __GNUC_PREREQ (3, 0) --# define __mempcpy(dest, src, n) \ -- (__extension__ (__builtin_constant_p (src) && __builtin_constant_p (n) \ -- && __string2_1bptr_p (src) && n <= 8 \ -- ? __builtin_memcpy (dest, src, n) + (n) \ -- : __mempcpy (dest, src, n))) --# else --# define __mempcpy(dest, src, n) \ -- (__extension__ (__builtin_constant_p (src) && __builtin_constant_p (n) \ -- && __string2_1bptr_p (src) && n <= 8 \ -- ? __mempcpy_small (dest, __mempcpy_args (src), n) \ -- : __mempcpy (dest, src, n))) --# endif --/* In glibc we use this function frequently but for namespace reasons -- we have to use the name `__mempcpy'. 
*/ --# define mempcpy(dest, src, n) __mempcpy (dest, src, n) --# endif -- --# if !__GNUC_PREREQ (3, 0) || defined _FORCE_INLINES --# if _STRING_ARCH_unaligned --# ifndef _FORCE_INLINES --# define __mempcpy_args(src) \ -- ((const char *) (src))[0], ((const char *) (src))[2], \ -- ((const char *) (src))[4], ((const char *) (src))[6], \ -- __extension__ __STRING2_SMALL_GET16 (src, 0), \ -- __extension__ __STRING2_SMALL_GET16 (src, 4), \ -- __extension__ __STRING2_SMALL_GET32 (src, 0), \ -- __extension__ __STRING2_SMALL_GET32 (src, 4) --# endif --__STRING_INLINE void *__mempcpy_small (void *, char, char, char, char, -- __uint16_t, __uint16_t, __uint32_t, -- __uint32_t, size_t); -+#if defined _FORCE_INLINES -+# if _STRING_ARCH_unaligned - __STRING_INLINE void * - __mempcpy_small (void *__dest1, - char __src0_1, char __src2_1, char __src4_1, char __src6_1, -@@ -298,44 +139,7 @@ __mempcpy_small (void *__dest1, - } - return (void *) __u; - } --# else --# ifndef _FORCE_INLINES --# define __mempcpy_args(src) \ -- ((const char *) (src))[0], \ -- __extension__ ((__STRING2_COPY_ARR2) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1] } }), \ -- __extension__ ((__STRING2_COPY_ARR3) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2] } }), \ -- __extension__ ((__STRING2_COPY_ARR4) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3] } }), \ -- __extension__ ((__STRING2_COPY_ARR5) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4] } }), \ -- __extension__ ((__STRING2_COPY_ARR6) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4], ((const char *) (src))[5] } }), \ -- __extension__ ((__STRING2_COPY_ARR7) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4], ((const char *) (src))[5], \ -- ((const char *) (src))[6] } }), \ -- __extension__ ((__STRING2_COPY_ARR8) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4], ((const char *) (src))[5], \ -- ((const char *) (src))[6], ((const char *) (src))[7] } }) --# endif --__STRING_INLINE void *__mempcpy_small (void *, char, __STRING2_COPY_ARR2, -- __STRING2_COPY_ARR3, -- __STRING2_COPY_ARR4, -- __STRING2_COPY_ARR5, -- __STRING2_COPY_ARR6, -- __STRING2_COPY_ARR7, -- __STRING2_COPY_ARR8, size_t); -+# else - __STRING_INLINE void * - __mempcpy_small (void *__dest, char __src1, - __STRING2_COPY_ARR2 __src2, __STRING2_COPY_ARR3 __src3, -@@ -382,8 +186,6 @@ __mempcpy_small (void *__dest, char __src1, - } - return __extension__ ((void *) __u + __srclen); - } --# endif --# endif - # endif - #endif - -@@ -391,44 +193,17 @@ __mempcpy_small (void *__dest, char __src1, - /* Return pointer to C in S. */ - #ifndef _HAVE_STRING_ARCH_strchr - extern void *__rawmemchr (const void *__s, int __c); --# if __GNUC_PREREQ (3, 2) - # define strchr(s, c) \ - (__extension__ (__builtin_constant_p (c) && !__builtin_constant_p (s) \ - && (c) == '\0' \ - ? (char *) __rawmemchr (s, c) \ - : __builtin_strchr (s, c))) --# else --# define strchr(s, c) \ -- (__extension__ (__builtin_constant_p (c) && (c) == '\0' \ -- ? 
(char *) __rawmemchr (s, c) \ -- : strchr (s, c))) --# endif - #endif - - - /* Copy SRC to DEST. */ --#if (!defined _HAVE_STRING_ARCH_strcpy && !__GNUC_PREREQ (3, 0)) \ -- || defined _FORCE_INLINES --# if !defined _HAVE_STRING_ARCH_strcpy && !__GNUC_PREREQ (3, 0) --# define strcpy(dest, src) \ -- (__extension__ (__builtin_constant_p (src) \ -- ? (__string2_1bptr_p (src) && strlen (src) + 1 <= 8 \ -- ? __strcpy_small (dest, __strcpy_args (src), \ -- strlen (src) + 1) \ -- : (char *) memcpy (dest, src, strlen (src) + 1)) \ -- : strcpy (dest, src))) --# endif -- -+#if defined _FORCE_INLINES - # if _STRING_ARCH_unaligned --# ifndef _FORCE_INLINES --# define __strcpy_args(src) \ -- __extension__ __STRING2_SMALL_GET16 (src, 0), \ -- __extension__ __STRING2_SMALL_GET16 (src, 4), \ -- __extension__ __STRING2_SMALL_GET32 (src, 0), \ -- __extension__ __STRING2_SMALL_GET32 (src, 4) --# endif --__STRING_INLINE char *__strcpy_small (char *, __uint16_t, __uint16_t, -- __uint32_t, __uint32_t, size_t); - __STRING_INLINE char * - __strcpy_small (char *__dest, - __uint16_t __src0_2, __uint16_t __src4_2, -@@ -482,42 +257,6 @@ __strcpy_small (char *__dest, - return __dest; - } - # else --# ifndef _FORCE_INLINES --# define __strcpy_args(src) \ -- __extension__ ((__STRING2_COPY_ARR2) \ -- { { ((const char *) (src))[0], '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR3) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR4) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR5) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR6) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4], '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR7) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4], ((const char *) (src))[5], \ -- '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR8) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4], ((const char *) (src))[5], \ -- ((const char *) (src))[6], '\0' } }) --# endif --__STRING_INLINE char *__strcpy_small (char *, __STRING2_COPY_ARR2, -- __STRING2_COPY_ARR3, -- __STRING2_COPY_ARR4, -- __STRING2_COPY_ARR5, -- __STRING2_COPY_ARR6, -- __STRING2_COPY_ARR7, -- __STRING2_COPY_ARR8, size_t); - __STRING_INLINE char * - __strcpy_small (char *__dest, - __STRING2_COPY_ARR2 __src2, __STRING2_COPY_ARR3 __src3, -@@ -570,44 +309,15 @@ __strcpy_small (char *__dest, - - /* Copy SRC to DEST, returning pointer to final NUL byte. */ - #ifdef __USE_GNU --# if !defined _HAVE_STRING_ARCH_stpcpy || defined _FORCE_INLINES --# ifndef _HAVE_STRING_ARCH_stpcpy --# if __GNUC_PREREQ (3, 4) --# define __stpcpy(dest, src) __builtin_stpcpy (dest, src) --# elif __GNUC_PREREQ (3, 0) --# define __stpcpy(dest, src) \ -- (__extension__ (__builtin_constant_p (src) \ -- ? (__string2_1bptr_p (src) && strlen (src) + 1 <= 8 \ -- ? 
__builtin_strcpy (dest, src) + strlen (src) \ -- : ((char *) (__mempcpy) (dest, src, strlen (src) + 1) \ -- - 1)) \ -- : __stpcpy (dest, src))) --# else --# define __stpcpy(dest, src) \ -- (__extension__ (__builtin_constant_p (src) \ -- ? (__string2_1bptr_p (src) && strlen (src) + 1 <= 8 \ -- ? __stpcpy_small (dest, __stpcpy_args (src), \ -- strlen (src) + 1) \ -- : ((char *) (__mempcpy) (dest, src, strlen (src) + 1) \ -- - 1)) \ -- : __stpcpy (dest, src))) --# endif -+# ifndef _HAVE_STRING_ARCH_stpcpy -+# define __stpcpy(dest, src) __builtin_stpcpy (dest, src) - /* In glibc we use this function frequently but for namespace reasons - we have to use the name `__stpcpy'. */ --# define stpcpy(dest, src) __stpcpy (dest, src) --# endif -+# define stpcpy(dest, src) __stpcpy (dest, src) -+# endif - --# if !__GNUC_PREREQ (3, 0) || defined _FORCE_INLINES --# if _STRING_ARCH_unaligned --# ifndef _FORCE_INLINES --# define __stpcpy_args(src) \ -- __extension__ __STRING2_SMALL_GET16 (src, 0), \ -- __extension__ __STRING2_SMALL_GET16 (src, 4), \ -- __extension__ __STRING2_SMALL_GET32 (src, 0), \ -- __extension__ __STRING2_SMALL_GET32 (src, 4) --# endif --__STRING_INLINE char *__stpcpy_small (char *, __uint16_t, __uint16_t, -- __uint32_t, __uint32_t, size_t); -+# ifndef _FORCE_INLINES -+# if _STRING_ARCH_unaligned - __STRING_INLINE char * - __stpcpy_small (char *__dest, - __uint16_t __src0_2, __uint16_t __src4_2, -@@ -665,43 +375,7 @@ __stpcpy_small (char *__dest, - } - return &__u->__c; - } --# else --# ifndef _FORCE_INLINES --# define __stpcpy_args(src) \ -- __extension__ ((__STRING2_COPY_ARR2) \ -- { { ((const char *) (src))[0], '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR3) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR4) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR5) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR6) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4], '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR7) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4], ((const char *) (src))[5], \ -- '\0' } }), \ -- __extension__ ((__STRING2_COPY_ARR8) \ -- { { ((const char *) (src))[0], ((const char *) (src))[1], \ -- ((const char *) (src))[2], ((const char *) (src))[3], \ -- ((const char *) (src))[4], ((const char *) (src))[5], \ -- ((const char *) (src))[6], '\0' } }) --# endif --__STRING_INLINE char *__stpcpy_small (char *, __STRING2_COPY_ARR2, -- __STRING2_COPY_ARR3, -- __STRING2_COPY_ARR4, -- __STRING2_COPY_ARR5, -- __STRING2_COPY_ARR6, -- __STRING2_COPY_ARR7, -- __STRING2_COPY_ARR8, size_t); -+# else - __STRING_INLINE char * - __stpcpy_small (char *__dest, - __STRING2_COPY_ARR2 __src2, __STRING2_COPY_ARR3 __src3, -@@ -748,27 +422,11 @@ __stpcpy_small (char *__dest, - } - return __dest + __srclen - 1; - } --# endif - # endif - # endif - #endif - - --/* Copy no more than N characters of SRC to DEST. 
*/ --#ifndef _HAVE_STRING_ARCH_strncpy --# if __GNUC_PREREQ (3, 2) --# define strncpy(dest, src, n) __builtin_strncpy (dest, src, n) --# else --# define strncpy(dest, src, n) \ -- (__extension__ (__builtin_constant_p (src) && __builtin_constant_p (n) \ -- ? (strlen (src) + 1 >= ((size_t) (n)) \ -- ? (char *) memcpy (dest, src, n) \ -- : strncpy (dest, src, n)) \ -- : strncpy (dest, src, n))) --# endif --#endif -- -- - /* Append no more than N characters from SRC onto DEST. */ - #ifndef _HAVE_STRING_ARCH_strncat - # ifdef _USE_STRING_ARCH_strchr -@@ -780,380 +438,29 @@ __stpcpy_small (char *__dest, - : (*((char *) __mempcpy (strchr (__dest, '\0'), \ - src, n)) = '\0', __dest)) \ - : strncat (dest, src, n); })) --# elif __GNUC_PREREQ (3, 2) --# define strncat(dest, src, n) __builtin_strncat (dest, src, n) - # else --# define strncat(dest, src, n) \ -- (__extension__ (__builtin_constant_p (src) && __builtin_constant_p (n) \ -- ? (strlen (src) < ((size_t) (n)) \ -- ? strcat (dest, src) \ -- : strncat (dest, src, n)) \ -- : strncat (dest, src, n))) --# endif --#endif -- -- --/* Compare characters of S1 and S2. */ --#ifndef _HAVE_STRING_ARCH_strcmp --# if __GNUC_PREREQ (3, 2) --# define strcmp(s1, s2) \ -- __extension__ \ -- ({ size_t __s1_len, __s2_len; \ -- (__builtin_constant_p (s1) && __builtin_constant_p (s2) \ -- && (__s1_len = strlen (s1), __s2_len = strlen (s2), \ -- (!__string2_1bptr_p (s1) || __s1_len >= 4) \ -- && (!__string2_1bptr_p (s2) || __s2_len >= 4)) \ -- ? __builtin_strcmp (s1, s2) \ -- : (__builtin_constant_p (s1) && __string2_1bptr_p (s1) \ -- && (__s1_len = strlen (s1), __s1_len < 4) \ -- ? (__builtin_constant_p (s2) && __string2_1bptr_p (s2) \ -- ? __builtin_strcmp (s1, s2) \ -- : __strcmp_cg (s1, s2, __s1_len)) \ -- : (__builtin_constant_p (s2) && __string2_1bptr_p (s2) \ -- && (__s2_len = strlen (s2), __s2_len < 4) \ -- ? (__builtin_constant_p (s1) && __string2_1bptr_p (s1) \ -- ? __builtin_strcmp (s1, s2) \ -- : __strcmp_gc (s1, s2, __s2_len)) \ -- : __builtin_strcmp (s1, s2)))); }) --# else --# define strcmp(s1, s2) \ -- __extension__ \ -- ({ size_t __s1_len, __s2_len; \ -- (__builtin_constant_p (s1) && __builtin_constant_p (s2) \ -- && (__s1_len = strlen (s1), __s2_len = strlen (s2), \ -- (!__string2_1bptr_p (s1) || __s1_len >= 4) \ -- && (!__string2_1bptr_p (s2) || __s2_len >= 4)) \ -- ? memcmp ((const char *) (s1), (const char *) (s2), \ -- (__s1_len < __s2_len ? __s1_len : __s2_len) + 1) \ -- : (__builtin_constant_p (s1) && __string2_1bptr_p (s1) \ -- && (__s1_len = strlen (s1), __s1_len < 4) \ -- ? (__builtin_constant_p (s2) && __string2_1bptr_p (s2) \ -- ? __strcmp_cc (s1, s2, __s1_len) \ -- : __strcmp_cg (s1, s2, __s1_len)) \ -- : (__builtin_constant_p (s2) && __string2_1bptr_p (s2) \ -- && (__s2_len = strlen (s2), __s2_len < 4) \ -- ? (__builtin_constant_p (s1) && __string2_1bptr_p (s1) \ -- ? 
__strcmp_cc (s1, s2, __s2_len) \ -- : __strcmp_gc (s1, s2, __s2_len)) \ -- : strcmp (s1, s2)))); }) -+# define strncat(dest, src, n) __builtin_strncat (dest, src, n) - # endif -- --# define __strcmp_cc(s1, s2, l) \ -- (__extension__ ({ int __result = \ -- (((const unsigned char *) (const char *) (s1))[0] \ -- - ((const unsigned char *) (const char *)(s2))[0]); \ -- if (l > 0 && __result == 0) \ -- { \ -- __result = (((const unsigned char *) \ -- (const char *) (s1))[1] \ -- - ((const unsigned char *) \ -- (const char *) (s2))[1]); \ -- if (l > 1 && __result == 0) \ -- { \ -- __result = \ -- (((const unsigned char *) \ -- (const char *) (s1))[2] \ -- - ((const unsigned char *) \ -- (const char *) (s2))[2]); \ -- if (l > 2 && __result == 0) \ -- __result = \ -- (((const unsigned char *) \ -- (const char *) (s1))[3] \ -- - ((const unsigned char *) \ -- (const char *) (s2))[3]); \ -- } \ -- } \ -- __result; })) -- --# define __strcmp_cg(s1, s2, l1) \ -- (__extension__ ({ const unsigned char *__s2 = \ -- (const unsigned char *) (const char *) (s2); \ -- int __result = \ -- (((const unsigned char *) (const char *) (s1))[0] \ -- - __s2[0]); \ -- if (l1 > 0 && __result == 0) \ -- { \ -- __result = (((const unsigned char *) \ -- (const char *) (s1))[1] - __s2[1]); \ -- if (l1 > 1 && __result == 0) \ -- { \ -- __result = (((const unsigned char *) \ -- (const char *) (s1))[2] - __s2[2]); \ -- if (l1 > 2 && __result == 0) \ -- __result = (((const unsigned char *) \ -- (const char *) (s1))[3] \ -- - __s2[3]); \ -- } \ -- } \ -- __result; })) -- --# define __strcmp_gc(s1, s2, l2) \ -- (__extension__ ({ const unsigned char *__s1 = \ -- (const unsigned char *) (const char *) (s1); \ -- register int __result = \ -- __s1[0] - ((const unsigned char *) \ -- (const char *) (s2))[0]; \ -- if (l2 > 0 && __result == 0) \ -- { \ -- __result = (__s1[1] \ -- - ((const unsigned char *) \ -- (const char *) (s2))[1]); \ -- if (l2 > 1 && __result == 0) \ -- { \ -- __result = \ -- (__s1[2] - ((const unsigned char *) \ -- (const char *) (s2))[2]); \ -- if (l2 > 2 && __result == 0) \ -- __result = \ -- (__s1[3] \ -- - ((const unsigned char *) \ -- (const char *) (s2))[3]); \ -- } \ -- } \ -- __result; })) --#endif -- -- --/* Compare N characters of S1 and S2. */ --#ifndef _HAVE_STRING_ARCH_strncmp --# define strncmp(s1, s2, n) \ -- (__extension__ (__builtin_constant_p (n) \ -- && ((__builtin_constant_p (s1) \ -- && strlen (s1) < ((size_t) (n))) \ -- || (__builtin_constant_p (s2) \ -- && strlen (s2) < ((size_t) (n)))) \ -- ? strcmp (s1, s2) : strncmp (s1, s2, n))) - #endif - - - /* Return the length of the initial segment of S which - consists entirely of characters not in REJECT. */ --#if !defined _HAVE_STRING_ARCH_strcspn || defined _FORCE_INLINES --# ifndef _HAVE_STRING_ARCH_strcspn --# if __GNUC_PREREQ (3, 2) --# define strcspn(s, reject) \ -- __extension__ \ -- ({ char __r0, __r1, __r2; \ -- (__builtin_constant_p (reject) && __string2_1bptr_p (reject) \ -- ? ((__builtin_constant_p (s) && __string2_1bptr_p (s)) \ -- ? __builtin_strcspn (s, reject) \ -- : ((__r0 = ((const char *) (reject))[0], __r0 == '\0') \ -- ? strlen (s) \ -- : ((__r1 = ((const char *) (reject))[1], __r1 == '\0') \ -- ? __strcspn_c1 (s, __r0) \ -- : ((__r2 = ((const char *) (reject))[2], __r2 == '\0') \ -- ? __strcspn_c2 (s, __r0, __r1) \ -- : (((const char *) (reject))[3] == '\0' \ -- ? 
__strcspn_c3 (s, __r0, __r1, __r2) \ -- : __builtin_strcspn (s, reject)))))) \ -- : __builtin_strcspn (s, reject)); }) --# else --# define strcspn(s, reject) \ -- __extension__ \ -- ({ char __r0, __r1, __r2; \ -- (__builtin_constant_p (reject) && __string2_1bptr_p (reject) \ -- ? ((__r0 = ((const char *) (reject))[0], __r0 == '\0') \ -- ? strlen (s) \ -- : ((__r1 = ((const char *) (reject))[1], __r1 == '\0') \ -- ? __strcspn_c1 (s, __r0) \ -- : ((__r2 = ((const char *) (reject))[2], __r2 == '\0') \ -- ? __strcspn_c2 (s, __r0, __r1) \ -- : (((const char *) (reject))[3] == '\0' \ -- ? __strcspn_c3 (s, __r0, __r1, __r2) \ -- : strcspn (s, reject))))) \ -- : strcspn (s, reject)); }) --# endif --# endif -- --__STRING_INLINE size_t __strcspn_c1 (const char *__s, int __reject); --__STRING_INLINE size_t --__strcspn_c1 (const char *__s, int __reject) --{ -- size_t __result = 0; -- while (__s[__result] != '\0' && __s[__result] != __reject) -- ++__result; -- return __result; --} -- --__STRING_INLINE size_t __strcspn_c2 (const char *__s, int __reject1, -- int __reject2); --__STRING_INLINE size_t --__strcspn_c2 (const char *__s, int __reject1, int __reject2) --{ -- size_t __result = 0; -- while (__s[__result] != '\0' && __s[__result] != __reject1 -- && __s[__result] != __reject2) -- ++__result; -- return __result; --} -- --__STRING_INLINE size_t __strcspn_c3 (const char *__s, int __reject1, -- int __reject2, int __reject3); --__STRING_INLINE size_t --__strcspn_c3 (const char *__s, int __reject1, int __reject2, -- int __reject3) --{ -- size_t __result = 0; -- while (__s[__result] != '\0' && __s[__result] != __reject1 -- && __s[__result] != __reject2 && __s[__result] != __reject3) -- ++__result; -- return __result; --} -+#ifndef _HAVE_STRING_ARCH_strcspn -+# define strcspn(s, reject) __builtin_strcspn (s, reject) - #endif - - - /* Return the length of the initial segment of S which - consists entirely of characters in ACCEPT. */ --#if !defined _HAVE_STRING_ARCH_strspn || defined _FORCE_INLINES --# ifndef _HAVE_STRING_ARCH_strspn --# if __GNUC_PREREQ (3, 2) --# define strspn(s, accept) \ -- __extension__ \ -- ({ char __a0, __a1, __a2; \ -- (__builtin_constant_p (accept) && __string2_1bptr_p (accept) \ -- ? ((__builtin_constant_p (s) && __string2_1bptr_p (s)) \ -- ? __builtin_strspn (s, accept) \ -- : ((__a0 = ((const char *) (accept))[0], __a0 == '\0') \ -- ? ((void) (s), (size_t) 0) \ -- : ((__a1 = ((const char *) (accept))[1], __a1 == '\0') \ -- ? __strspn_c1 (s, __a0) \ -- : ((__a2 = ((const char *) (accept))[2], __a2 == '\0') \ -- ? __strspn_c2 (s, __a0, __a1) \ -- : (((const char *) (accept))[3] == '\0' \ -- ? __strspn_c3 (s, __a0, __a1, __a2) \ -- : __builtin_strspn (s, accept)))))) \ -- : __builtin_strspn (s, accept)); }) --# else --# define strspn(s, accept) \ -- __extension__ \ -- ({ char __a0, __a1, __a2; \ -- (__builtin_constant_p (accept) && __string2_1bptr_p (accept) \ -- ? ((__a0 = ((const char *) (accept))[0], __a0 == '\0') \ -- ? ((void) (s), (size_t) 0) \ -- : ((__a1 = ((const char *) (accept))[1], __a1 == '\0') \ -- ? __strspn_c1 (s, __a0) \ -- : ((__a2 = ((const char *) (accept))[2], __a2 == '\0') \ -- ? __strspn_c2 (s, __a0, __a1) \ -- : (((const char *) (accept))[3] == '\0' \ -- ? 
__strspn_c3 (s, __a0, __a1, __a2) \ -- : strspn (s, accept))))) \ -- : strspn (s, accept)); }) --# endif --# endif -- --__STRING_INLINE size_t __strspn_c1 (const char *__s, int __accept); --__STRING_INLINE size_t --__strspn_c1 (const char *__s, int __accept) --{ -- size_t __result = 0; -- /* Please note that __accept never can be '\0'. */ -- while (__s[__result] == __accept) -- ++__result; -- return __result; --} -- --__STRING_INLINE size_t __strspn_c2 (const char *__s, int __accept1, -- int __accept2); --__STRING_INLINE size_t --__strspn_c2 (const char *__s, int __accept1, int __accept2) --{ -- size_t __result = 0; -- /* Please note that __accept1 and __accept2 never can be '\0'. */ -- while (__s[__result] == __accept1 || __s[__result] == __accept2) -- ++__result; -- return __result; --} -- --__STRING_INLINE size_t __strspn_c3 (const char *__s, int __accept1, -- int __accept2, int __accept3); --__STRING_INLINE size_t --__strspn_c3 (const char *__s, int __accept1, int __accept2, int __accept3) --{ -- size_t __result = 0; -- /* Please note that __accept1 to __accept3 never can be '\0'. */ -- while (__s[__result] == __accept1 || __s[__result] == __accept2 -- || __s[__result] == __accept3) -- ++__result; -- return __result; --} -+#ifndef _HAVE_STRING_ARCH_strspn -+# define strspn(s, accept) __builtin_strspn (s, accept) - #endif - - - /* Find the first occurrence in S of any character in ACCEPT. */ --#if !defined _HAVE_STRING_ARCH_strpbrk || defined _FORCE_INLINES --# ifndef _HAVE_STRING_ARCH_strpbrk --# if __GNUC_PREREQ (3, 2) --# define strpbrk(s, accept) \ -- __extension__ \ -- ({ char __a0, __a1, __a2; \ -- (__builtin_constant_p (accept) && __string2_1bptr_p (accept) \ -- ? ((__builtin_constant_p (s) && __string2_1bptr_p (s)) \ -- ? __builtin_strpbrk (s, accept) \ -- : ((__a0 = ((const char *) (accept))[0], __a0 == '\0') \ -- ? ((void) (s), (char *) NULL) \ -- : ((__a1 = ((const char *) (accept))[1], __a1 == '\0') \ -- ? __builtin_strchr (s, __a0) \ -- : ((__a2 = ((const char *) (accept))[2], __a2 == '\0') \ -- ? __strpbrk_c2 (s, __a0, __a1) \ -- : (((const char *) (accept))[3] == '\0' \ -- ? __strpbrk_c3 (s, __a0, __a1, __a2) \ -- : __builtin_strpbrk (s, accept)))))) \ -- : __builtin_strpbrk (s, accept)); }) --# else --# define strpbrk(s, accept) \ -- __extension__ \ -- ({ char __a0, __a1, __a2; \ -- (__builtin_constant_p (accept) && __string2_1bptr_p (accept) \ -- ? ((__a0 = ((const char *) (accept))[0], __a0 == '\0') \ -- ? ((void) (s), (char *) NULL) \ -- : ((__a1 = ((const char *) (accept))[1], __a1 == '\0') \ -- ? strchr (s, __a0) \ -- : ((__a2 = ((const char *) (accept))[2], __a2 == '\0') \ -- ? __strpbrk_c2 (s, __a0, __a1) \ -- : (((const char *) (accept))[3] == '\0' \ -- ? __strpbrk_c3 (s, __a0, __a1, __a2) \ -- : strpbrk (s, accept))))) \ -- : strpbrk (s, accept)); }) --# endif --# endif -- --__STRING_INLINE char *__strpbrk_c2 (const char *__s, int __accept1, -- int __accept2); --__STRING_INLINE char * --__strpbrk_c2 (const char *__s, int __accept1, int __accept2) --{ -- /* Please note that __accept1 and __accept2 never can be '\0'. */ -- while (*__s != '\0' && *__s != __accept1 && *__s != __accept2) -- ++__s; -- return *__s == '\0' ? NULL : (char *) (size_t) __s; --} -- --__STRING_INLINE char *__strpbrk_c3 (const char *__s, int __accept1, -- int __accept2, int __accept3); --__STRING_INLINE char * --__strpbrk_c3 (const char *__s, int __accept1, int __accept2, int __accept3) --{ -- /* Please note that __accept1 to __accept3 never can be '\0'. 
*/
--  while (*__s != '\0' && *__s != __accept1 && *__s != __accept2
--	 && *__s != __accept3)
--    ++__s;
--  return *__s == '\0' ? NULL : (char *) (size_t) __s;
--}
--#endif
--
--
--/* Find the first occurrence of NEEDLE in HAYSTACK.  Newer gcc versions
--   do this itself.  */
--#if !defined _HAVE_STRING_ARCH_strstr && !__GNUC_PREREQ (2, 97)
--# define strstr(haystack, needle) \
--  (__extension__ (__builtin_constant_p (needle) && __string2_1bptr_p (needle) \
--		  ? (((const char *) (needle))[0] == '\0' \
--		     ? (char *) (size_t) (haystack) \
--		     : (((const char *) (needle))[1] == '\0' \
--			? strchr (haystack, \
--				  ((const char *) (needle))[0]) \
--			: strstr (haystack, needle))) \
--		  : strstr (haystack, needle)))
-+#ifndef _HAVE_STRING_ARCH_strpbrk
-+# define strpbrk(s, accept) __builtin_strpbrk (s, accept)
- #endif
- 
- 
-diff --git a/string/strncat.c b/string/strncat.c
-index dcfb04d..a9cb913 100644
---- a/string/strncat.c
-+++ b/string/strncat.c
-@@ -1,4 +1,4 @@
--/* Copyright (C) 1991,1997,2011 Free Software Foundation, Inc.
-+/* Copyright (C) 1991-2022 Free Software Foundation, Inc.
-    This file is part of the GNU C Library.
- 
-    The GNU C Library is free software; you can redistribute it and/or
-@@ -13,14 +13,10 @@
- 
-    You should have received a copy of the GNU Lesser General Public
-    License along with the GNU C Library; if not, see
--   <http://www.gnu.org/licenses/>.  */
-+   <https://www.gnu.org/licenses/>.  */
- 
- #include <string.h>
- 
--#ifdef _LIBC
--# include <memcopy.h>
--#endif
--
- #ifndef STRNCAT
- # undef strncat
- # define STRNCAT strncat
-@@ -29,54 +25,16 @@
- char *
- STRNCAT (char *s1, const char *s2, size_t n)
- {
--  char c;
-   char *s = s1;
- 
-   /* Find the end of S1.  */
--  do
--    c = *s1++;
--  while (c != '\0');
--
--  /* Make S1 point before next character, so we can increment
--     it while memory is read (wins on pipelined cpus).  */
--  s1 -= 2;
-+  s1 += strlen (s1);
- 
--  if (n >= 4)
--    {
--      size_t n4 = n >> 2;
--      do
--	{
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    return s;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    return s;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    return s;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    return s;
--	} while (--n4 > 0);
--      n &= 3;
--    }
-+  size_t ss = __strnlen (s2, n);
- 
--  while (n > 0)
--    {
--      c = *s2++;
--      *++s1 = c;
--      if (c == '\0')
--	return s;
--      n--;
--    }
--
--  if (c != '\0')
--    *++s1 = '\0';
-+  s1[ss] = '\0';
-+  memcpy (s1, s2, ss);
- 
-   return s;
- }
-+
-diff --git a/string/strncpy.c b/string/strncpy.c
-index 19d501e..83fb610 100644
---- a/string/strncpy.c
-+++ b/string/strncpy.c
-@@ -1,4 +1,4 @@
--/* Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
-+/* Copyright (C) 1991-2022 Free Software Foundation, Inc.
-    This file is part of the GNU C Library.
- 
-    The GNU C Library is free software; you can redistribute it and/or
-@@ -13,75 +13,22 @@
- 
-    You should have received a copy of the GNU Lesser General Public
-    License along with the GNU C Library; if not, see
--   <http://www.gnu.org/licenses/>.  */
-+   <https://www.gnu.org/licenses/>.  */
- 
- #include <string.h>
--#include <memcopy.h>
- 
- #undef strncpy
- 
- #ifndef STRNCPY
--#define STRNCPY strncpy
-+ #define STRNCPY strncpy
- #endif
- 
- char *
- STRNCPY (char *s1, const char *s2, size_t n)
- {
--  char c;
--  char *s = s1;
--
--  --s1;
--
--  if (n >= 4)
--    {
--      size_t n4 = n >> 2;
--
--      for (;;)
--	{
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    break;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    break;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    break;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    break;
--	  if (--n4 == 0)
--	    goto last_chars;
--	}
--      n = n - (s1 - s) - 1;
--      if (n == 0)
--	return s;
--      goto zero_fill;
--    }
--
-- last_chars:
--  n &= 3;
--  if (n == 0)
--    return s;
--
--  do
--    {
--      c = *s2++;
--      *++s1 = c;
--      if (--n == 0)
--	return s;
--    }
--  while (c != '\0');
--
-- zero_fill:
--  do
--    *++s1 = '\0';
--  while (--n > 0);
--
--  return s;
-+  size_t size = __strnlen (s2, n);
-+  if (size != n)
-+    memset (s1 + size, '\0', n - size);
-+  return memcpy (s1, s2, size);
- }
- libc_hidden_builtin_def (strncpy)
-diff --git a/sysdeps/gnu/bits/utmp.h b/sysdeps/gnu/bits/utmp.h
-index aed2750..434a533 100644
---- a/sysdeps/gnu/bits/utmp.h
-+++ b/sysdeps/gnu/bits/utmp.h
-@@ -59,10 +59,14 @@ struct utmp
- {
-   short int ut_type;		/* Type of login.  */
-   pid_t ut_pid;			/* Process ID of login process.  */
--  char ut_line[UT_LINESIZE];	/* Devicename.  */
--  char ut_id[4];		/* Inittab ID.  */
--  char ut_user[UT_NAMESIZE];	/* Username.  */
--  char ut_host[UT_HOSTSIZE];	/* Hostname for remote login.  */
-+  char ut_line[UT_LINESIZE]
-+    __attribute_nonstring__;	/* Devicename.  */
-+  char ut_id[4]
-+    __attribute_nonstring__;	/* Inittab ID.  */
-+  char ut_user[UT_NAMESIZE]
-+    __attribute_nonstring__;	/* Username.  */
-+  char ut_host[UT_HOSTSIZE]
-+    __attribute_nonstring__;	/* Hostname for remote login.  */
-   struct exit_status ut_exit;	/* Exit status of a process marked
- 				   as DEAD_PROCESS.  */
-   /* The ut_session and ut_tv fields must be the same size when compiled
-diff --git a/sysdeps/gnu/bits/utmpx.h b/sysdeps/gnu/bits/utmpx.h
-index f8716ca..13d84e4 100644
---- a/sysdeps/gnu/bits/utmpx.h
-+++ b/sysdeps/gnu/bits/utmpx.h
-@@ -56,10 +56,14 @@ struct utmpx
- {
-   short int ut_type;		/* Type of login.  */
-   __pid_t ut_pid;		/* Process ID of login process.  */
--  char ut_line[__UT_LINESIZE];	/* Devicename.  */
--  char ut_id[4];		/* Inittab ID.  */
--  char ut_user[__UT_NAMESIZE];	/* Username.  */
--  char ut_host[__UT_HOSTSIZE];	/* Hostname for remote login.  */
-+  char ut_line[__UT_LINESIZE]
-+    __attribute_nonstring__;	/* Devicename.  */
-+  char ut_id[4]
-+    __attribute_nonstring__;	/* Inittab ID.  */
-+  char ut_user[__UT_NAMESIZE]
-+    __attribute_nonstring__;	/* Username.  */
-+  char ut_host[__UT_HOSTSIZE]
-+    __attribute_nonstring__;	/* Hostname for remote login.  */
-   struct __exit_status ut_exit;	/* Exit status of a process marked
- 				   as DEAD_PROCESS.
*/ - -diff --git a/sysdeps/unix/sysv/linux/if_index.c b/sysdeps/unix/sysv/linux/if_index.c -index 8ba5eae..b620d21 100644 ---- a/sysdeps/unix/sysv/linux/if_index.c -+++ b/sysdeps/unix/sysv/linux/if_index.c -@@ -38,12 +38,19 @@ __if_nametoindex (const char *ifname) - return 0; - #else - struct ifreq ifr; -+ if (strlen (ifname) >= IFNAMSIZ) -+ { -+ __set_errno (ENODEV); -+ return 0; -+ } -+ -+ strncpy (ifr.ifr_name, ifname, sizeof (ifr.ifr_name)); -+ - int fd = __opensock (); - - if (fd < 0) - return 0; - -- strncpy (ifr.ifr_name, ifname, sizeof (ifr.ifr_name)); - if (__ioctl (fd, SIOCGIFINDEX, &ifr) < 0) - { - int saved_errno = errno; -diff --git a/timezone/zic.c b/timezone/zic.c -index a5202a1..772d081 100644 ---- a/timezone/zic.c -+++ b/timezone/zic.c -@@ -1609,7 +1609,7 @@ writezone(const char *const name, const char *const string) - } - #define DO(field) ((void) fwrite(tzh.field, sizeof tzh.field, 1, fp)) - tzh = tzh0; -- (void) strncpy(tzh.tzh_magic, TZ_MAGIC, sizeof tzh.tzh_magic); -+ memcpy(tzh.tzh_magic, TZ_MAGIC, sizeof tzh.tzh_magic); - tzh.tzh_version[0] = ZIC_VERSION; - convert(eitol(thistypecnt), tzh.tzh_ttisgmtcnt); - convert(eitol(thistypecnt), tzh.tzh_ttisstdcnt); diff --git a/ci/official/containers/ml_build_arm64/builder.packages.txt b/ci/official/containers/ml_build_arm64/builder.packages.txt deleted file mode 100644 index 2be317ca4e256b..00000000000000 --- a/ci/official/containers/ml_build_arm64/builder.packages.txt +++ /dev/null @@ -1,47 +0,0 @@ -# Packages needed to build devtoolset -file -flex -make -build-essential -patch -rpm2cpio -unar -wget -xz-utils -cpio -gawk -texinfo -gettext - -# Other build-related tools -software-properties-common -apt-transport-https -autoconf -automake -ca-certificates -pkg-config -libcurl3-dev -libcurl4-openssl-dev -libfreetype6-dev -libhdf5-serial-dev -libomp-18-dev -libssl-dev -libtool -libssl-dev -libxml2-dev -libxslt1-dev -libzmq3-dev -llvm-18 -clang-18 -clang-tidy-18 -lld-18 -clang-format-12 -curl -git -parallel -sudo -swig -unzip -zip -openjdk-21-jdk -vim diff --git a/ci/official/containers/ml_build_arm64/devel.bashrc b/ci/official/containers/ml_build_arm64/devel.bashrc deleted file mode 100644 index 755d48783b1b9a..00000000000000 --- a/ci/official/containers/ml_build_arm64/devel.bashrc +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
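Note on the glibc patch quoted and removed above: it rewrites strncat and strncpy in terms of __strnlen, memcpy and memset, and the if_index.c and zic.c hunks stop relying on strncpy for NUL-termination (strncpy leaves the destination unterminated whenever the source is at least n bytes long, which is why __if_nametoindex now rejects names of IFNAMSIZ or more up front and zic.c copies the TZ_MAGIC bytes with memcpy). The following is only a minimal C sketch of those simplified semantics, not code from the patch: sketch_strncpy and sketch_strncat are illustrative names, and standard strnlen stands in for glibc's internal __strnlen.

#include <string.h>

/* Same observable behaviour as strncpy: copy at most n bytes of src into
   dst, zero-fill the rest of the n-byte destination, and return dst.  When
   strnlen (src, n) == n the result is NOT NUL-terminated, hence the explicit
   length checks added to callers in the patch above.  */
char *
sketch_strncpy (char *dst, const char *src, size_t n)
{
  size_t len = strnlen (src, n);         /* bytes actually copied */
  if (len != n)
    memset (dst + len, '\0', n - len);   /* pad the tail with NUL bytes */
  return memcpy (dst, src, len);
}

/* Same observable behaviour as strncat: append at most n bytes of src to
   the end of dst and always NUL-terminate the result.  */
char *
sketch_strncat (char *dst, const char *src, size_t n)
{
  char *end = dst + strlen (dst);        /* find the end of dst */
  size_t len = strnlen (src, n);         /* bytes to append */
  end[len] = '\0';                       /* terminate first, as in the patch */
  memcpy (end, src, len);
  return dst;
}

The related bits/utmp.h and bits/utmpx.h hunks mark the fixed-width ut_* fields with __attribute_nonstring__, telling GCC that these arrays are deliberately not NUL-terminated strings so that bounded copies into them do not trigger -Wstringop-truncation.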
-# -# ============================================================================== - -# Do not print anything if this is not being used interactively -[ -z "$PS1" ] && return - -# Set up attractive prompt -export PS1="\[\e[31m\]tf-docker\[\e[m\] \[\e[33m\]\w\[\e[m\] > " -export TERM=xterm-256color -alias grep="grep --color=auto" -alias ls="ls --color=auto" -# Fix nvidia-docker -ldconfig diff --git a/ci/official/containers/ml_build_arm64/ld.so.conf b/ci/official/containers/ml_build_arm64/ld.so.conf deleted file mode 100644 index e2aa028720ed2c..00000000000000 --- a/ci/official/containers/ml_build_arm64/ld.so.conf +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2023 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# Builds a devtoolset cross-compiler targeting manylinux2014 (glibc 2.17 / libstdc++ 4.8). - -/lib64 diff --git a/ci/official/containers/ml_build_arm64/requirements.txt b/ci/official/containers/ml_build_arm64/requirements.txt deleted file mode 100644 index 6ae6deda141234..00000000000000 --- a/ci/official/containers/ml_build_arm64/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -portpicker==1.6.0 -# For wheel verification, and uploading -auditwheel ~= 6.1.0 -twine ~= 6.1.0 - -# uv is faster than pip for installing Python packages. -uv ~= 0.5.30 \ No newline at end of file diff --git a/ci/official/containers/ml_build_arm64/setup.packages.sh b/ci/official/containers/ml_build_arm64/setup.packages.sh deleted file mode 100755 index 347b853e349385..00000000000000 --- a/ci/official/containers/ml_build_arm64/setup.packages.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# setup.packages.sh: Given a list of Ubuntu packages, install them and clean up. 
-# Usage: setup.packages.sh -set -e - -# Prevent apt install tzinfo from asking our location (assumes UTC) -export DEBIAN_FRONTEND=noninteractive - -apt-get update -# Remove commented lines and blank lines -apt-get install -y --no-install-recommends $(sed -e '/^\s*#.*$/d' -e '/^\s*$/d' "$1" | sort -u) -rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/ci/official/containers/ml_build_arm64/setup.python.sh b/ci/official/containers/ml_build_arm64/setup.python.sh deleted file mode 100755 index ff5ade526536fa..00000000000000 --- a/ci/official/containers/ml_build_arm64/setup.python.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# setup.python.sh: Install a specific Python version and packages for it. -# Usage: setup.python.sh -set -xe - -source ~/.bashrc -VERSION=$1 -REQUIREMENTS=$2 - -add-apt-repository ppa:deadsnakes/ppa -# Install Python packages for this container's version -if [[ ${VERSION} == "python3.13-nogil" ]]; then - cat >pythons.txt <pythons.txt <pythons.txt < requirements_without_twine.txt - REQUIREMENTS=requirements_without_twine.txt -fi - -# Disable the cache dir to save image space, and install packages -/usr/bin/$VERSION -m pip install --no-cache-dir -r $REQUIREMENTS -U diff --git a/ci/official/containers/ml_build_arm64/setup.sources.sh b/ci/official/containers/ml_build_arm64/setup.sources.sh deleted file mode 100755 index f8c87d4ceade60..00000000000000 --- a/ci/official/containers/ml_build_arm64/setup.sources.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# setup.python.sh: Install a specific Python version and packages for it. -# Usage: setup.python.sh - -# Sets up custom apt sources for our TF images. 
- -# Prevent apt install tzinfo from asking our location (assumes UTC) -export DEBIAN_FRONTEND=noninteractive - -# Set up shared custom sources -apt-get update -apt-get install -y gnupg ca-certificates - -# Deadsnakes: https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa -apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F23C5A6CF475977595C89F51BA6932366A755776 - -# LLVM/Clang: https://apt.llvm.org/ -apt-key adv --fetch-keys https://apt.llvm.org/llvm-snapshot.gpg.key - -# Set up custom sources -cat >/etc/apt/sources.list.d/custom.list <(data); } -TFE_TensorHandle* MakeLoggedTensorHandle( - TFE_Context* context, const tensorflow::string& logging_device_name, - std::unique_ptr t, TF_Status* status) { +TFE_TensorHandle* MakeLoggedTensorHandle(TFE_Context* context, + const std::string& logging_device_name, + std::unique_ptr t, + TF_Status* status) { auto dtype = TFE_TensorHandleDataType(t->tensor); TFE_CustomDeviceTensorHandleMethods handle_methods; handle_methods.num_dims = &LoggedTensorNumDims; diff --git a/tensorflow/c/experimental/ops/gen/cpp/BUILD b/tensorflow/c/experimental/ops/gen/cpp/BUILD index 1e1d4eca98106a..05bd307fd499ec 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/BUILD +++ b/tensorflow/c/experimental/ops/gen/cpp/BUILD @@ -28,7 +28,6 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:op_gen_lib", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/platform:str_util", "@com_google_absl//absl/strings", ], alwayslink = 1, diff --git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc index 45e7b87069e361..e4b82c59072123 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h" +#include #include #include "tensorflow/c/experimental/ops/gen/common/path_config.h" diff --git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc index e1db2c9b8ce14b..d97bd7ee2d921f 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h" #include +#include #include #include "tensorflow/c/experimental/ops/gen/common/path_config.h" diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc index 7c8231a71133f5..cd4e0af1ec8454 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc @@ -14,6 +14,8 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h" +#include + #include "absl/strings/ascii.h" #include "absl/strings/str_split.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc index 50db08df1db988..b3d33c379549b5 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h" #include +#include #include "tensorflow/c/experimental/ops/gen/common/case_format.h" #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc index 0ec8108bee7aaf..5aea065a45dffc 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h" +#include + #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" #include "tensorflow/core/platform/path.h" diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc index b490cc7fe9e86a..96f317f6201286 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h" +#include + #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc index 63cb5f30eb1d9d..766adae9a558a1 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h" #include +#include #include #include "absl/strings/str_cat.h" diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc index 6a608d759a3753..5acf000cd71169 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc @@ -14,9 +14,12 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" +#include + #include "absl/log/log.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "absl/strings/substitute.h" #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h" #include "tensorflow/core/lib/strings/str_util.h" diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc index 6621d1aea2c217..cdcbad089a556e 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h" +#include + #include "tensorflow/c/experimental/ops/gen/common/path_config.h" #include "tensorflow/c/experimental/ops/gen/common/source_code.h" #include "tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h" diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD index 6e8dbc8512fa86..a7e93841a98627 100644 --- a/tensorflow/c/kernels/BUILD +++ b/tensorflow/c/kernels/BUILD @@ -117,6 +117,7 @@ tf_cc_test( "//tensorflow/core:testlib", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@local_xla//xla/tsl/protobuf:error_codes_proto_impl_cc", ], ) diff --git a/tensorflow/c/kernels/bitcast_op_test.cc b/tensorflow/c/kernels/bitcast_op_test.cc index c44bc832547dab..e7ae841194f226 100644 --- a/tensorflow/c/kernels/bitcast_op_test.cc +++ b/tensorflow/c/kernels/bitcast_op_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "xla/tsl/protobuf/error_codes.pb.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/attr_value_util.h" diff --git a/tensorflow/c/kernels/merge_summary_op.cc b/tensorflow/c/kernels/merge_summary_op.cc index ddbc3440d47dc1..9945f473874e20 100644 --- a/tensorflow/c/kernels/merge_summary_op.cc +++ b/tensorflow/c/kernels/merge_summary_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include #include +#include #include #include "absl/log/check.h" diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc index b8b8b2f29cfe13..3064224e9b12bf 100644 --- a/tensorflow/c/kernels_test.cc +++ b/tensorflow/c/kernels_test.cc @@ -405,7 +405,7 @@ TEST_F(TestKernelAttr, String) { /*max_length*/ 5, status); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - EXPECT_EQ("bunny", string(static_cast(val.get()), 5)); + EXPECT_EQ("bunny", std::string(static_cast(val.get()), 5)); TF_DeleteStatus(status); return static_cast(s); }; @@ -421,7 +421,7 @@ TEST_F(TestKernelAttr, StringList) { s->created = true; s->compute_called = false; - std::vector list = {"bugs", "bunny", "duck"}; + std::vector list = {"bugs", "bunny", "duck"}; int list_total_size = 0; for (const auto& s : list) { list_total_size += s.size(); @@ -440,7 +440,8 @@ TEST_F(TestKernelAttr, StringList) { for (size_t i = 0; i < list.size(); ++i) { EXPECT_EQ(list[i].size(), lens[i]) << i; - EXPECT_EQ(list[i], string(static_cast(values[i]), lens[i])) + EXPECT_EQ(list[i], + std::string(static_cast(values[i]), lens[i])) << i; } TF_DeleteStatus(status); @@ -823,7 +824,7 @@ TEST(TestKernel, TestInputAndOutputCount) { TF_Status* s = TF_NewStatus(); TF_GetInput(ctx, 0, &input, s); EXPECT_EQ(TF_OK, TF_GetCode(s)) << "Failed to get input: " << TF_Message(s); - EXPECT_EQ(123, *static_cast(TF_TensorData(input))); + EXPECT_EQ(123, *static_cast(TF_TensorData(input))); TF_GetInput(ctx, -1, &input, s); EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s)); TF_GetInput(ctx, 3, &input, s); @@ -866,7 +867,7 @@ TEST(TestKernel, TestInputAndOutputCount) { p.device = &dummy_device; p.step_id = 43; - Tensor t(tensorflow::uint8(123)); + Tensor t(uint8_t(123)); absl::InlinedVector inputs; // Simulate 2 inputs @@ -886,7 +887,7 @@ TEST(TestKernel, TestInputAndOutputCount) { ASSERT_EQ(2, num_inputs); ASSERT_EQ(1, num_outputs); - ASSERT_EQ(123, ctx.mutable_output(0)->scalar()()); + ASSERT_EQ(123, ctx.mutable_output(0)->scalar()()); } } diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc index c2a4d73f8ad620..e49f5a099ee72d 100644 --- a/tensorflow/c/python_api.cc +++ b/tensorflow/c/python_api.cc @@ -84,7 +84,7 @@ std::string GetHandleShapeAndType(TF_Graph* graph, TF_Output output) { *out_shape_and_type->mutable_type() = p.type; } } - string result; + std::string result; handle_data.SerializeToString(&result); return result; } diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index bfa665a09f7588..3131284b4802bd 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -359,6 +359,7 @@ cc_library( "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], alwayslink = 1, ) diff --git a/tensorflow/cc/experimental/libexport/load.cc b/tensorflow/cc/experimental/libexport/load.cc index fd7f2d159e6166..670fa4f51f5cc1 100644 --- a/tensorflow/cc/experimental/libexport/load.cc +++ b/tensorflow/cc/experimental/libexport/load.cc @@ -31,8 +31,9 @@ using protobuf::RepeatedPtrField; absl::StatusOr TFPackage::Load(const std::string& path) { // Load the proto TFPackage tf_package; - const string saved_model_pb_path = io::JoinPath(path, kSavedModelFilenamePb); - const string saved_model_pbtxt_path = + const std::string saved_model_pb_path = + io::JoinPath(path, kSavedModelFilenamePb); + const std::string saved_model_pbtxt_path = io::JoinPath(path, kSavedModelFilenamePbTxt); 
if (Env::Default()->FileExists(saved_model_pb_path).ok()) { TF_RETURN_IF_ERROR(ReadBinaryProto(Env::Default(), saved_model_pb_path, diff --git a/tensorflow/cc/experimental/libexport/save_test.cc b/tensorflow/cc/experimental/libexport/save_test.cc index fbcc3c2e53b426..1a0ba4f0662a92 100644 --- a/tensorflow/cc/experimental/libexport/save_test.cc +++ b/tensorflow/cc/experimental/libexport/save_test.cc @@ -25,7 +25,7 @@ namespace libexport { namespace { TEST(SaveTest, TestDirectoryStructure) { - const string base_dir = tensorflow::io::JoinPath( + const std::string base_dir = tensorflow::io::JoinPath( tensorflow::testing::TmpDir(), "test_directory_structure"); TF_ASSERT_OK(Save(base_dir)); TF_ASSERT_OK(Env::Default()->IsDirectory(base_dir)); diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc index 0185fd11d37dec..b457f602b4a5b0 100644 --- a/tensorflow/cc/framework/scope.cc +++ b/tensorflow/cc/framework/scope.cc @@ -79,7 +79,7 @@ Scope Scope::DisabledShapeInferenceScope() { /* disable_shape_inference */ true)); } -Scope::Impl::Impl(const Scope& other, Tags::ScopeName, const string& name, +Scope::Impl::Impl(const Scope& other, Tags::ScopeName, const std::string& name, bool copy_names) : graph_(other.impl()->graph_), status_(other.impl()->status_), @@ -98,8 +98,8 @@ Scope::Impl::Impl(const Scope& other, Tags::ScopeName, const string& name, colocation_constraints_(other.impl()->colocation_constraints_), disable_shape_inference_(other.impl()->disable_shape_inference_) {} -Scope::Impl::Impl(const Scope& other, Tags::OpName, const string& name, - const string& op_name) +Scope::Impl::Impl(const Scope& other, Tags::OpName, const std::string& name, + const std::string& op_name) : graph_(other.impl()->graph_), status_(other.impl()->status_), name_map_(other.impl()->name_map_), @@ -140,7 +140,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ControlDeps, colocation_constraints_(other.impl()->colocation_constraints_), disable_shape_inference_(other.impl()->disable_shape_inference_) {} -Scope::Impl::Impl(const Scope& other, Tags::Device, const string& device) +Scope::Impl::Impl(const Scope& other, Tags::Device, const std::string& device) : graph_(other.impl()->graph_), status_(other.impl()->status_), name_map_(other.impl()->name_map_), @@ -158,7 +158,7 @@ Scope::Impl::Impl(const Scope& other, Tags::Device, const string& device) disable_shape_inference_(other.impl()->disable_shape_inference_) {} Scope::Impl::Impl(const Scope& other, Tags::SingleUseScope, - const string& op_name) + const std::string& op_name) : graph_(other.impl()->graph_), status_(other.impl()->status_), name_map_(other.impl()->name_map_), @@ -193,7 +193,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ExitOnError) disable_shape_inference_(other.impl()->disable_shape_inference_) {} Scope::Impl::Impl(const Scope& other, Tags::KernelLabel, - const string& kernel_label) + const std::string& kernel_label) : graph_(other.impl()->graph_), status_(other.impl()->status_), name_map_(other.impl()->name_map_), @@ -227,12 +227,12 @@ Scope::Impl::Impl(const Scope& other, Tags::Colocate, xla_cluster_(other.impl()->xla_cluster_), colocation_constraints_( clear_colocations - ? std::unordered_set() + ? 
std::unordered_set() : other.impl()->GetColocationConstraints(colocate_with_op)), disable_shape_inference_(other.impl()->disable_shape_inference_) {} Scope::Impl::Impl(const Scope& other, Tags::AssignedDevice, - const string& assigned_device) + const std::string& assigned_device) : graph_(other.impl()->graph_), status_(other.impl()->status_), name_map_(other.impl()->name_map_), @@ -250,7 +250,7 @@ Scope::Impl::Impl(const Scope& other, Tags::AssignedDevice, disable_shape_inference_(other.impl()->disable_shape_inference_) {} Scope::Impl::Impl(const Scope& other, Tags::XlaCluster, - const string& xla_cluster) + const std::string& xla_cluster) : graph_(other.impl()->graph_), status_(other.impl()->status_), name_map_(other.impl()->name_map_), @@ -267,13 +267,13 @@ Scope::Impl::Impl(const Scope& other, Tags::XlaCluster, colocation_constraints_(other.impl()->colocation_constraints_), disable_shape_inference_(other.impl()->disable_shape_inference_) {} -std::unordered_set Scope::Impl::GetColocationConstraints( +std::unordered_set Scope::Impl::GetColocationConstraints( const Operation& colocate_with_op) const { - std::unordered_set current_constraints(colocation_constraints_); + std::unordered_set current_constraints(colocation_constraints_); const AttrSlice attrs = colocate_with_op.node()->attrs(); - std::vector node_constraints; + std::vector node_constraints; if (TryGetNodeAttr(attrs, kColocationAttrName, &node_constraints)) { - for (const string& entry : node_constraints) { + for (const std::string& entry : node_constraints) { absl::string_view s(entry); if (absl::ConsumePrefix(&s, kColocationGroupPrefix)) { current_constraints.emplace(s); @@ -335,13 +335,14 @@ void Scope::UpdateBuilder(NodeBuilder* builder) const { } if (!impl()->colocation_constraints_.empty()) { - std::vector constraints(impl()->colocation_constraints_.begin(), - impl()->colocation_constraints_.end()); + std::vector constraints( + impl()->colocation_constraints_.begin(), + impl()->colocation_constraints_.end()); // Sort the set. std::sort(constraints.begin(), constraints.end()); // Add loc:@ prefix std::transform(constraints.begin(), constraints.end(), constraints.begin(), - [](const string& s) { + [](const std::string& s) { return absl::StrCat(kColocationGroupPrefix, s); }); builder->Attr(kColocationAttrName, constraints); @@ -357,8 +358,8 @@ void Scope::UpdateBuilder(NodeBuilder* builder) const { } } -string Scope::Impl::GetUniqueName(const string& prefix, - bool check_single_use) const { +std::string Scope::Impl::GetUniqueName(const std::string& prefix, + bool check_single_use) const { if (check_single_use && single_use_scope()) { if (*scope_used_) { *status_ = @@ -373,7 +374,7 @@ string Scope::Impl::GetUniqueName(const string& prefix, name_map_->insert({prefix, 0}); return prefix; } - string unique_name; + std::string unique_name; do { unique_name = absl::StrCat(prefix, kSuffixSeparator, ++entry->second); } while (name_map_->find(unique_name) != name_map_->end()); @@ -381,15 +382,15 @@ string Scope::Impl::GetUniqueName(const string& prefix, return unique_name; } -string Scope::Impl::GetNameForOp(const string& default_name) const { - const string unique_name = +std::string Scope::Impl::GetNameForOp(const std::string& default_name) const { + const std::string unique_name = GetUniqueName(default_name, true /* check_single_use */); - const string sep = + const std::string sep = name_.empty() || unique_name.empty() ? 
"" : kScopeSeparator; return absl::StrCat(name_, sep, unique_name); } -string Scope::GetUniqueNameForOp(const string& default_name) const { +std::string Scope::GetUniqueNameForOp(const std::string& default_name) const { if (impl()->single_use_scope()) { if (impl()->op_name_.empty() || *impl()->scope_used_) { *impl()->status_ = @@ -403,21 +404,21 @@ string Scope::GetUniqueNameForOp(const string& default_name) const { : impl()->GetNameForOp(impl()->op_name_); } -Scope Scope::NewSubScope(const string& child_scope_name) const { +Scope Scope::NewSubScope(const std::string& child_scope_name) const { if (child_scope_name.empty()) { return Scope(new Impl(*this, Impl::Tags::ScopeName(), impl()->name_, true /* copy_names */)); } - const string unique_name = + const std::string unique_name = impl()->GetUniqueName(child_scope_name, false /* check_single_use */); - const string sep = + const std::string sep = impl()->name_.empty() || unique_name.empty() ? "" : kScopeSeparator; return Scope(new Impl(*this, Impl::Tags::ScopeName(), absl::StrCat(impl()->name_, sep, unique_name), false /* copy_names */)); } -Scope Scope::WithOpNameImpl(const string& op_name) const { +Scope Scope::WithOpNameImpl(const std::string& op_name) const { if (impl()->single_use_scope()) { UpdateStatus(errors::InvalidArgument("Cannot set op name ", op_name, " on this scope")); @@ -446,15 +447,15 @@ Scope Scope::WithNoControlDependencies() const { /* clear_control_deps */ true)); } -Scope Scope::WithDevice(const string& device) const { +Scope Scope::WithDevice(const std::string& device) const { return Scope(new Impl(*this, Impl::Tags::Device(), device)); } -Scope Scope::WithAssignedDevice(const string& assigned_device) const { +Scope Scope::WithAssignedDevice(const std::string& assigned_device) const { return Scope(new Impl(*this, Impl::Tags::AssignedDevice(), assigned_device)); } -Scope Scope::WithXlaCluster(const string& xla_cluster) const { +Scope Scope::WithXlaCluster(const std::string& xla_cluster) const { return Scope(new Impl(*this, Impl::Tags::XlaCluster(), xla_cluster)); } @@ -472,12 +473,12 @@ Scope Scope::ExitOnError() const { return Scope(new Impl(*this, Impl::Tags::ExitOnError())); } -Scope Scope::WithKernelLabel(const string& kernel_label) const { +Scope Scope::WithKernelLabel(const std::string& kernel_label) const { return Scope(new Impl(*this, Impl::Tags::KernelLabel(), kernel_label)); } CompositeOpScopes Scope::GetCompositeOpScopes( - const string& composite_op_name) const { + const std::string& composite_op_name) const { if (impl()->op_name_.empty() && composite_op_name.empty()) { UpdateStatus(errors::InvalidArgument( "Cannot create composite op scopes with empty name")); @@ -486,8 +487,9 @@ CompositeOpScopes Scope::GetCompositeOpScopes( if (!impl()->single_use_scope()) { Scope child = NewSubScope(impl()->op_name_.empty() ? composite_op_name : impl()->op_name_); - const string child_op_sep = impl()->name_.empty() ? "" : kSuffixSeparator; - const string child_name = + const std::string child_op_sep = + impl()->name_.empty() ? 
"" : kSuffixSeparator; + const std::string child_name = absl::StrCat(impl()->name_, child_op_sep, child.impl()->name_); return {child, Scope(new Impl(child, Impl::Tags::SingleUseScope(), child_name))}; @@ -510,11 +512,11 @@ class InternalScope { ShapeRefiner* refiner) { Scope::Impl::NameMap* name_map = new Scope::Impl::NameMap; for (const Node* node : graph->nodes()) { - const string& name = node->name(); + const std::string& name = node->name(); (*name_map)[name] = 0; // Add all name prefixes ('/' separated). size_t idx = -1; - while ((idx = name.find(kScopeSeparator, idx + 1)) != string::npos) { + while ((idx = name.find(kScopeSeparator, idx + 1)) != std::string::npos) { (*name_map)[name.substr(0, idx)] = 0; } } @@ -533,7 +535,7 @@ Scope NewInternalScope(Graph* graph, absl::Status* status, return InternalScope::NewScope(graph, status, refiner); } -absl::Status CreateOutputWithScope(string op_name, +absl::Status CreateOutputWithScope(std::string op_name, absl::Span inputs, const Scope& scope, Output* output) { TF_RETURN_IF_ERROR(scope.status()); diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index f3c3fd045a3d6f..f0189c60c714e1 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -14,9 +14,11 @@ limitations under the License. ==============================================================================*/ #include +#include #include #include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "tensorflow/cc/framework/grad_op_registry.h" #include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/ops/array_ops_internal.h" diff --git a/tensorflow/cc/gradients/image_grad.cc b/tensorflow/cc/gradients/image_grad.cc index deb90eec264ee7..bb37c90b3f32a8 100644 --- a/tensorflow/cc/gradients/image_grad.cc +++ b/tensorflow/cc/gradients/image_grad.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include "absl/status/status.h" diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index c785af15f95447..af39009ad3f2a5 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include "absl/status/status.h" diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index 6309080492c1da..9b980bd9e8321d 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include #include #include #include diff --git a/tensorflow/cc/saved_model/experimental/tests/BUILD b/tensorflow/cc/saved_model/experimental/tests/BUILD index 3270ca916e14a0..995f2a18d6979b 100644 --- a/tensorflow/cc/saved_model/experimental/tests/BUILD +++ b/tensorflow/cc/saved_model/experimental/tests/BUILD @@ -23,5 +23,6 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings:string_view", ], ) diff --git a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc index ac85bd728cb7e4..baa3b6be991076 100644 --- a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc +++ b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/strings/string_view.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/cc/experimental/base/public/runtime.h" #include "tensorflow/cc/experimental/base/public/runtime_builder.h" diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc index 1d23f9d87e2d7d..87b696edc39681 100644 --- a/tensorflow/cc/training/queue_runner.cc +++ b/tensorflow/cc/training/queue_runner.cc @@ -118,7 +118,7 @@ absl::Status QueueRunner::StartAndCollectCostGraph( absl::Status QueueRunner::Start(Session* sess, int wait_for) { counter_ = std::make_unique(runs_); - for (const string& enqueue_op : enqueue_op_names_) { + for (const std::string& enqueue_op : enqueue_op_names_) { thread_pool_->Schedule( std::bind(&QueueRunner::Run, this, sess, enqueue_op)); } @@ -182,7 +182,7 @@ void QueueRunner::UpdateStatus(const absl::Status& status) { } } -void QueueRunner::Run(Session* sess, const string& enqueue_op) { +void QueueRunner::Run(Session* sess, const std::string& enqueue_op) { bool first_iteration = true; absl::Status status; while (status.ok()) { @@ -245,7 +245,7 @@ void QueueRunner::SetRunArgumentsAndCostGraph(const RunOptions& run_options) { run_options_ = run_options; } -absl::Status QueueRunner::RealRun(Session* sess, const string& op, +absl::Status QueueRunner::RealRun(Session* sess, const std::string& op, bool update_costs) { absl::Status s; if (update_costs && cg_mu_) { diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h index d5d6ca10a56821..ffba8987c6d518 100644 --- a/tensorflow/cc/training/queue_runner.h +++ b/tensorflow/cc/training/queue_runner.h @@ -97,7 +97,7 @@ class QueueRunner : public RunnerInterface { absl::Status Init(const QueueRunnerDef& queue_runner_def); // The Run function for each thread. - void Run(Session* sess, const string& enqueue_op); + void Run(Session* sess, const std::string& enqueue_op); // Updates the internal status; it only keeps OK or the first unexpected error // status. @@ -112,12 +112,12 @@ class QueueRunner : public RunnerInterface { void SetRunArgumentsAndCostGraph(const RunOptions& run_options); - absl::Status RealRun(Session* sess, const string& op, bool update_costs); + absl::Status RealRun(Session* sess, const std::string& op, bool update_costs); - string queue_name_; - std::vector enqueue_op_names_; - string close_op_name_; - string cancel_op_name_; + std::string queue_name_; + std::vector enqueue_op_names_; + std::string close_op_name_; + std::string cancel_op_name_; // code::Code casted to int to avoid a hash function. 
std::unordered_set queue_closed_exception_types_; diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 87cb051b75df63..1042ff1fa7a896 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -1206,9 +1206,9 @@ absl::StatusOr GenerateConstantBuffersData( auto aot_thunk_result_temp, xla::cpu::CpuAotCompilationResult::FromString(serialized, nullptr)); - TF_ASSIGN_OR_RETURN( - auto executable, - std::move(*aot_thunk_result_temp).LoadExecutable(nullptr, nullptr)); + TF_ASSIGN_OR_RETURN(auto executable, + std::move(*aot_thunk_result_temp) + .LoadExecutable(/*stream_exec=*/nullptr)); xla::cpu::CpuExecutable* cpu_executable = tsl::down_cast(executable.get()); diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 7c1772c084750c..91313abca45a24 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -65,7 +65,10 @@ filegroup( # Please use the individual targets in the deps list as needed. See b/336889334. cc_library( name = "jit", - visibility = internal_visibility([":legacy_jit_users"]), + visibility = internal_visibility([ + ":legacy_jit_users", + "//tensorflow/python/profiler:__pkg__", + ]), deps = [ ":xla_cpu_device", ":xla_cpu_jit", @@ -253,7 +256,6 @@ cc_library( hdrs = ["xla_device_context.h"], visibility = ["//visibility:public"], deps = [ - ":xla_launch_util", ":xla_tensor", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:layout_util", @@ -261,14 +263,29 @@ cc_library( "//tensorflow/compiler/tf2xla:xla_helpers", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core/common_runtime:device", "//tensorflow/core/common_runtime:dma_helper", "//tensorflow/core/framework:allocator", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", "@com_google_absl//absl/synchronization", - "@local_tsl//tsl/platform:statusor", - "@local_xla//xla:util", + "@local_xla//xla:literal", + "@local_xla//xla:shape_util", + "@local_xla//xla:status_macros", "@local_xla//xla/client:local_client", + "@local_xla//xla/service:stream_pool", + "@local_xla//xla/stream_executor:allocator_stats", + "@local_xla//xla/stream_executor:event", + "@local_xla//xla/stream_executor:stream", + "@local_xla//xla/stream_executor:stream_executor_h", + "@local_xla//xla/tsl/platform:errors", + "@local_xla//xla/tsl/platform:statusor", ], ) @@ -682,7 +699,9 @@ tf_cc_test( name = "xla_launch_util_test", srcs = ["xla_launch_util_test.cc"], deps = [ + ":device_compilation_profiler", ":device_compiler", + ":device_executable_persistor", ":flags_headers", ":pjrt_device_compiler_client", ":variable_info", @@ -691,25 +710,35 @@ tf_cc_test( ":xla_cpu_jit", ":xla_device_no_jit_rewrite_registration", ":xla_launch_util", + "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:portable_gif_internal", + "//tensorflow/core:session_options", "//tensorflow/core/framework:fake_input", "//tensorflow/core/framework:tensor_testutil", "//tensorflow/core/kernels:ops_testutil", "//tensorflow/core/platform:refcount", "//tensorflow/core/tfrt/common:create_pjrt_client_util", 
"//tensorflow/core/tfrt/common:pjrt_util", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", "@com_google_googletest//:gtest_main", - "@local_tsl//tsl/platform:status", - "@local_tsl//tsl/platform:statusor", + "@local_xla//xla:literal", + "@local_xla//xla:literal_util", "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt:pjrt_common", + "@local_xla//xla/pjrt:pjrt_executable", "@local_xla//xla/pjrt/plugin/xla_cpu:cpu_client_options", "@local_xla//xla/pjrt/plugin/xla_cpu:xla_cpu_pjrt_client", "@local_xla//xla/tests:literal_test_util", "@local_xla//xla/tsl/framework:device_id_utils", "@local_xla//xla/tsl/lib/core:status_test_util", + "@local_xla//xla/tsl/platform:statusor", ], ) @@ -976,8 +1005,7 @@ tf_cc_test( "//tensorflow/core:session_options", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", - "@com_google_absl//absl/memory", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", ], ) @@ -1057,6 +1085,9 @@ cc_library( "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@local_xla//xla:status_macros", ], ) @@ -1075,11 +1106,10 @@ tf_cc_test( "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:ops", - "//tensorflow/core:portable_gif_internal", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/kernels:constant_op", - "@local_tsl//tsl/platform:status", + "@com_google_absl//absl/log:check", ], ) @@ -1106,12 +1136,15 @@ tf_cc_test( deps = [ ":encapsulate_util", "//tensorflow/cc:cc_ops", + "//tensorflow/cc:ops", "//tensorflow/cc:scope", + "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/log:check", ], ) @@ -1216,11 +1249,16 @@ cc_library( "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime:function_body", "//tensorflow/core/framework:bounds_check", + "//tensorflow/core/platform:hash", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", @@ -1363,6 +1401,9 @@ tf_cc_test( "//tensorflow/core:testlib", "//tensorflow/core/common_runtime:device_set", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/memory", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", @@ -1414,6 +1455,8 @@ cc_library( "//tensorflow/core:graph", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", @@ -1450,7 +1493,7 @@ cc_library( 
":xla_activity_proto_cc", ":xla_cluster_util", "//tensorflow/compiler/tf2xla:resource_operation_table", - "//tensorflow/compiler/tf2xla:tf2xla_util", + "//tensorflow/compiler/tf2xla:tf2xla_defs", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:core_cpu", @@ -1458,12 +1501,17 @@ cc_library( "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime:function_body", + "//tensorflow/core/common_runtime:function_utils", "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", "@local_xla//xla:union_find", "@local_xla//xla:util", "@local_xla//xla/service/graphcycles", @@ -1474,6 +1522,7 @@ tf_cc_test( name = "compilability_check_util_test", srcs = ["compilability_check_util_test.cc"], deps = [ + ":common", ":compilability_check_util", ":xla_cpu_device", ":xla_cpu_jit", @@ -1482,17 +1531,17 @@ tf_cc_test( "//tensorflow/cc:functional_ops", "//tensorflow/cc:ops", "//tensorflow/cc:scope", - "//tensorflow/compiler/tf2xla:test_util", "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/cc:xla_jit_ops", - "//tensorflow/compiler/tf2xla/cc:xla_ops", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", + "//tensorflow/core:lib", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "@com_google_absl//absl/memory", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -2052,12 +2101,15 @@ tf_cuda_cc_test( srcs = ["xla_platform_info_test.cc"], tags = tf_cuda_tests_tags() + ["config-cuda-only"], deps = [ + ":device_compilation_profiler", + ":device_compiler", ":flags_headers", ":test_util", ":xla_device_no_jit_rewrite_registration", ":xla_gpu_device", ":xla_gpu_jit", "//tensorflow/compiler/tf2xla:layout_util", + "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:framework", "//tensorflow/core:framework_types_hdr", "//tensorflow/core:lib_proto_parsing", @@ -2069,8 +2121,12 @@ tf_cuda_cc_test( "//tensorflow/core/tfrt/common:create_pjrt_client_util", "//tensorflow/core/tfrt/common:pjrt_util", "//tensorflow/core/tpu:tpu_defs", + "@com_google_absl//absl/log:check", "@com_google_googletest//:gtest_main", + "@local_xla//xla/client:local_client", + "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt/plugin/xla_cpu:cpu_client_options", "@local_xla//xla/pjrt/plugin/xla_cpu:xla_cpu_pjrt_client", + "@local_xla//xla/tsl/platform:statusor", ], ) diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 6c77648817f808..8da8b2055c6c2b 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -15,52 +15,40 @@ limitations under the License. 
#include "tensorflow/compiler/jit/compilability_check_util.h" -#include -#include -#include +#include #include -#include #include -#include -#include #include +#include #include "absl/algorithm/container.h" -#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/str_cat.h" -#include "absl/strings/str_join.h" #include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/compiler/jit/device_util.h" -#include "tensorflow/compiler/jit/flags.h" -#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" #include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" -#include "tensorflow/compiler/tf2xla/resource_operation_table.h" -#include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/compiler/tf2xla/tf2xla_defs.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "xla/service/graphcycles/graphcycles.h" -#include "xla/union_find.h" -#include "xla/util.h" -#include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/common_runtime/graph_constructor.h" +#include "xla/tsl/platform/errors.h" +#include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/common_runtime/function_utils.h" #include "tensorflow/core/framework/attr_value.pb.h" -#include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/memory_types.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/public/version.h" -#include "tensorflow/core/util/dump_graph.h" namespace tensorflow { @@ -341,8 +329,8 @@ bool RecursiveCompilabilityChecker::IsCompilableCall( return false; } - auto release_handle_on_return = gtl::MakeCleanup( - [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); }); + auto release_handle_on_return = + gtl::MakeCleanup([&] { CHECK_OK(lib_runtime->ReleaseHandle(handle)); }); const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle); bool is_compilable = true; for (const Node* node : fbody->graph->op_nodes()) { diff --git a/tensorflow/compiler/jit/compilability_check_util_test.cc b/tensorflow/compiler/jit/compilability_check_util_test.cc index ea24176bb04a4a..185afab797ee1e 100644 --- a/tensorflow/compiler/jit/compilability_check_util_test.cc +++ b/tensorflow/compiler/jit/compilability_check_util_test.cc @@ -15,21 +15,32 @@ limitations under the License. 
#include "tensorflow/compiler/jit/compilability_check_util.h" -#include "absl/memory/memory.h" +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/strings/match.h" +#include "absl/types/span.h" +#include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" -#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/functional_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "xla/tsl/lib/core/status_test_util.h" #include "tensorflow/core/common_runtime/graph_def_builder_util.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.h" -#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/version.h" namespace tensorflow { namespace { @@ -260,7 +271,7 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalWhileNode) { GraphDef graph_def; TF_EXPECT_OK(builder.ToGraphDef(&graph_def)); std::unique_ptr graph(new Graph(flib_def_.get())); - TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); + CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); auto while_node_it = std::find_if( graph->nodes().begin(), graph->nodes().end(), diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 94b136a02b99cf..776ec3915e2f73 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -15,26 +15,52 @@ limitations under the License. 
#include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" +#include +#include +#include #include +#include +#include +#include #include +#include +#include "absl/log/check.h" +#include "absl/status/status.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/types/span.h" #include "tensorflow/cc/framework/ops.h" -#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/math_ops.h" +#include "tensorflow/cc/ops/state_ops.h" #include "tensorflow/compiler/jit/encapsulate_util.h" #include "tensorflow/compiler/jit/extract_outside_compilation_pass.h" #include "tensorflow/compiler/jit/test_util.h" #include "tensorflow/compiler/tf2xla/side_effect_util.h" -#include "tensorflow/core/common_runtime/device_factory.h" -#include "tensorflow/core/common_runtime/function.h" +#include "xla/tsl/lib/core/status_test_util.h" +#include "xla/tsl/platform/errors.h" +#include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/graph_constructor.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/device_factory.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" #include "tensorflow/core/util/equal_graph_def.h" @@ -499,7 +525,7 @@ absl::Status Encapsulate( // Create FunctionLibraryRuntime. SessionOptions session_options; std::vector> devices; - TF_CHECK_OK(DeviceFactory::AddDevices( + CHECK_OK(DeviceFactory::AddDevices( session_options, "/job:localhost/replica:0/task:0", &devices)); OptimizerOptions opts; auto device_mgr = std::make_unique(std::move(devices)); diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc index 6d1661222e3eaf..4d2b71327b3250 100644 --- a/tensorflow/compiler/jit/encapsulate_util_test.cc +++ b/tensorflow/compiler/jit/encapsulate_util_test.cc @@ -15,12 +15,19 @@ limitations under the License. 
#include "tensorflow/compiler/jit/encapsulate_util.h" +#include + +#include "absl/log/check.h" +#include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/array_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/math_ops.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -35,16 +42,16 @@ TEST(PerformStaticShapeInferenceBeforeEncapsulationTest, Basic) { Output add = ops::Add(s.WithOpName("add"), const_0, const_1); Output identity = ops::Identity(s.WithOpName("identity"), add); Graph g(OpRegistry::Global()); - TF_CHECK_OK(s.ToGraph(&g)); + CHECK_OK(s.ToGraph(&g)); - TF_CHECK_OK(PerformStaticShapeInferenceBeforeEncapsulation(&g)); + CHECK_OK(PerformStaticShapeInferenceBeforeEncapsulation(&g)); // Check that "add" node now has _xla_inferred_shapes attr. auto node_index = g.BuildNodeNameIndex(); Node *add_node = node_index["add"]; std::vector output_shapes; - TF_CHECK_OK(GetNodeAttr(add_node->attrs(), kXlaInferredShapesAttrName, - &output_shapes)); + CHECK_OK(GetNodeAttr(add_node->attrs(), kXlaInferredShapesAttrName, + &output_shapes)); EXPECT_EQ(output_shapes.size(), 1); TensorShapeProto shape_proto; output_shapes[0].AsProto(&shape_proto); diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc index acd5319cf8ed16..6b0570b704e2d7 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc @@ -15,19 +15,31 @@ limitations under the License. 
#include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h" +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/math_ops.h" #include "tensorflow/cc/ops/resource_variable_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h" #include "tensorflow/compiler/tf2xla/test_util.h" +#include "xla/tsl/lib/core/status_test_util.h" #include "tensorflow/core/common_runtime/graph_constructor.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/lib/strings/proto_serialization.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/util/equal_graph_def.h" @@ -47,23 +59,23 @@ static std::unique_ptr MakeOuterGraph( auto w = ops::Placeholder(scope.WithOpName("W"), DT_RESOURCE); NodeDef def; - TF_CHECK_OK(NodeDefBuilder("launch0", function, &flib_def) - .Input(a.node()->name(), 0, DT_INT32) - .Input(b.node()->name(), 0, DT_FLOAT) - .Input(c.node()->name(), 0, DT_INT32) - .Input(d.node()->name(), 0, DT_FLOAT) - .Input(u.node()->name(), 0, DT_RESOURCE) - .Input(v.node()->name(), 0, DT_RESOURCE) - .Input(w.node()->name(), 0, DT_RESOURCE) - .Device("/gpu:0") - .Attr(kXlaClusterIdAttr, "launch0") - .Attr("_variable_start_index", 4) - .Finalize(&def)); + CHECK_OK(NodeDefBuilder("launch0", function, &flib_def) + .Input(a.node()->name(), 0, DT_INT32) + .Input(b.node()->name(), 0, DT_FLOAT) + .Input(c.node()->name(), 0, DT_INT32) + .Input(d.node()->name(), 0, DT_FLOAT) + .Input(u.node()->name(), 0, DT_RESOURCE) + .Input(v.node()->name(), 0, DT_RESOURCE) + .Input(w.node()->name(), 0, DT_RESOURCE) + .Device("/gpu:0") + .Attr(kXlaClusterIdAttr, "launch0") + .Attr("_variable_start_index", 4) + .Finalize(&def)); absl::Status status; Node* launch = scope.graph()->AddNode(def, &status); - TF_CHECK_OK(status); - TF_CHECK_OK(scope.DoShapeInference(launch)); + CHECK_OK(status); + CHECK_OK(scope.DoShapeInference(launch)); scope.graph()->AddEdge(a.node(), 0, launch, 0); scope.graph()->AddEdge(b.node(), 0, launch, 1); scope.graph()->AddEdge(c.node(), 0, launch, 2); @@ -89,7 +101,7 @@ static std::unique_ptr MakeOuterGraph( auto consumer3 = ops::Identity(scope.WithOpName("consumer3"), out3); std::unique_ptr graph(new Graph(OpRegistry::Global())); - TF_CHECK_OK(scope.ToGraph(graph.get())); + CHECK_OK(scope.ToGraph(graph.get())); return graph; } @@ -135,7 +147,7 @@ static std::unique_ptr MakeBodyGraph() { ops::_Retval(scope.WithOpName("readu_0_retval_RetVal"), read_u, 3); std::unique_ptr graph(new Graph(OpRegistry::Global())); - TF_CHECK_OK(scope.ToGraph(graph.get())); + CHECK_OK(scope.ToGraph(graph.get())); return graph; } @@ -160,7 +172,7 @@ TEST(EncapsulateXlaComputations, DeterministicEncapsulate) { }; add_attrs(e.node()); - TF_CHECK_OK(scope.ToGraph(graph.get())); + 
CHECK_OK(scope.ToGraph(graph.get())); auto get_node_in_graph = [&graph](Node* node) { return graph->FindNodeId(node->id()); }; @@ -178,7 +190,7 @@ TEST(EncapsulateXlaComputations, DeterministicEncapsulate) { get_node_in_graph(e.node()), true); } } - TF_CHECK_OK(EncapsulateXlaComputationsPass::Encapsulate(&graph, &flib_def)); + CHECK_OK(EncapsulateXlaComputationsPass::Encapsulate(&graph, &flib_def)); return SerializeGraphDeterministic(*graph).value(); }; diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc index 1a6441a80726a0..aa6ad2e4eeed8c 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc @@ -15,22 +15,39 @@ limitations under the License. #include "tensorflow/compiler/jit/extract_outside_compilation_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" #include "absl/strings/match.h" +#include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/functional_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/cc/ops/math_ops.h" #include "tensorflow/compiler/jit/encapsulate_util.h" #include "xla/hlo/testlib/test.h" -#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/device_factory.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" @@ -50,7 +67,7 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) { auto ret0 = ops::_Retval(s.WithOpName("ret0"), add, 0); auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg1, 1); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); auto node_name_image = g->BuildNodeNameIndex(); Node *add_node = node_name_image["add"]; EXPECT_NE(add_node, nullptr); @@ -61,7 +78,7 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) { std::vector arg_source_tensors; NodeDef call_node_def; call_node_def.set_op("0"); - TF_CHECK_OK( + CHECK_OK( rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def)); node_name_image = g->BuildNodeNameIndex(); @@ -75,7 +92,7 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) { Node *recv_at_host = node_name_image["outside_compilation_cluster__0_recv"]; EXPECT_NE(recv_at_host, nullptr); std::vector recv_at_host_dtypes; - TF_CHECK_OK( + CHECK_OK( GetNodeAttr(recv_at_host->attrs(), "Toutputs", &recv_at_host_dtypes)); EXPECT_EQ(recv_at_host_dtypes.size(), 3); EXPECT_EQ(recv_at_host_dtypes[0], DT_INT32); @@ -88,7 +105,7 @@ 
TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) { Node *send_from_host = node_name_image["outside_compilation_cluster__0_send"]; EXPECT_NE(send_from_host, nullptr); std::vector send_from_host_dtypes; - TF_CHECK_OK( + CHECK_OK( GetNodeAttr(send_from_host->attrs(), "Tinputs", &send_from_host_dtypes)); EXPECT_EQ(send_from_host_dtypes.size(), 2); EXPECT_EQ(send_from_host_dtypes[0], DT_INT32); @@ -115,8 +132,8 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) { EXPECT_TRUE(has_control_edge_to_send_from_host); // Verify step 7: necessary attrs added to call_node_def. NameAttrList shape_inference_graph; - TF_CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()), - "shape_inference_graph", &shape_inference_graph)); + CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()), + "shape_inference_graph", &shape_inference_graph)); EXPECT_EQ(shape_inference_graph.name(), "_outside_compilation_shape_inference_cluster__0"); } @@ -126,13 +143,13 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, NoSendFromHost) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_INT32, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster", ""); std::vector arg_source_tensors; NodeDef call_node_def; call_node_def.set_op("0"); - TF_CHECK_OK( + CHECK_OK( rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def)); auto node_name_image = g->BuildNodeNameIndex(); @@ -152,13 +169,13 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, NoRecvAtHost) { Output const0 = ops::Const(s.WithOpName("const0"), 1, {2}); auto ret = ops::_Retval(s.WithOpName("ret"), const0, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster", ""); std::vector arg_source_tensors; NodeDef call_node_def; call_node_def.set_op("0"); - TF_CHECK_OK( + CHECK_OK( rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def)); auto node_name_image = g->BuildNodeNameIndex(); @@ -176,13 +193,13 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, NoKeyPlaceholder) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output const0 = ops::Const(s.WithOpName("const0"), 1, {2}); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster", ""); std::vector arg_source_tensors; NodeDef call_node_def; call_node_def.set_op("0"); - TF_CHECK_OK( + CHECK_OK( rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def)); auto node_name_image = g->BuildNodeNameIndex(); @@ -202,7 +219,7 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, ShapesInferred) { Output const0 = ops::Const(s.WithOpName("const0"), 1, {2}); auto ret = ops::_Retval(s.WithOpName("ret"), const0, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); auto node_name_image = g->BuildNodeNameIndex(); Node *const0_node = node_name_image["const0"]; EXPECT_NE(const0_node, nullptr); @@ -214,13 +231,13 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, ShapesInferred) { std::vector arg_source_tensors; NodeDef call_node_def; call_node_def.set_op("0"); - TF_CHECK_OK( + CHECK_OK( rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def)); node_name_image = 
g->BuildNodeNameIndex(); // Check "shape" attr is available in call_node_def. std::vector shapes; - TF_CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()), "shapes", &shapes)); + CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()), "shapes", &shapes)); EXPECT_EQ(shapes.size(), 1); EXPECT_EQ(shapes[0].dim_size(), 1); } @@ -230,7 +247,7 @@ class ExtractOutsideCompilationForFunctionTest : public ::testing::Test { void SetUp() override { SessionOptions session_options; std::vector> devices; - TF_CHECK_OK(DeviceFactory::AddDevices( + CHECK_OK(DeviceFactory::AddDevices( session_options, "/job:localhost/replica:0/task:0", &devices)); device_mgr_ = std::make_unique(std::move(devices)); } @@ -275,7 +292,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0); Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); auto node_name_image = g->BuildNodeNameIndex(); node_name_image["identity0"]->AddAttr("_oc", "0"); node_name_image["identity1"]->AddAttr("_oc", "1"); @@ -284,7 +301,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { kXlaInferredShapesAttrName, std::vector{shape}); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); @@ -295,15 +312,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { NameAttrList name_attrs; name_attrs.set_name("cluster"); *name_attrs.mutable_attr() = attrs; - TF_CHECK_OK(ExtractOutsideCompilationTest( + CHECK_OK(ExtractOutsideCompilationTest( "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph", host_compute_core, &fld, &shape_inference_graphs, &has_outside_compilation)); // Get rewritten XLA computation function. std::unique_ptr xla_fbody; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), - AttrSlice(), &fld, &xla_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), AttrSlice(), + &fld, &xla_fbody)); auto node_name_index = xla_fbody->graph->BuildNodeNameIndex(); // Check XlaHostCompute nodes. @@ -313,26 +330,26 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { EXPECT_NE(host_compute_1, nullptr); // Check XlaHostCompute nodes' "tpu_core" attr. int tpu_core; - TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "tpu_core", &tpu_core)); + CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "tpu_core", &tpu_core)); EXPECT_EQ(tpu_core, 1); - TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "tpu_core", &tpu_core)); + CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "tpu_core", &tpu_core)); EXPECT_EQ(tpu_core, 0); // Check XlaHostCompute nodes' "shapes" attr. "0" should not have shapes, and // "1" should have shapes. std::vector shapes; - TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shapes", &shapes)); + CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shapes", &shapes)); EXPECT_EQ(shapes.size(), 0); - TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shapes", &shapes)); + CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shapes", &shapes)); EXPECT_EQ(shapes.size(), 1); EXPECT_EQ(shapes[0].dim_size(), 1); // Check XlaHostCompute nodes' "shape_inference_graph" attr. Both should have // empty values. 
NameAttrList shape_inference_graph; - TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph", - &shape_inference_graph)); + CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph", + &shape_inference_graph)); EXPECT_EQ(shape_inference_graph.name(), ""); - TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph", - &shape_inference_graph)); + CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph", + &shape_inference_graph)); EXPECT_EQ(shape_inference_graph.name(), ""); // Check `shape_inference_graphs`. @@ -344,7 +361,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { device_ordinal_temp_value.set_i(0); protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; - TF_CHECK_OK(FunctionDefToBodyHelper( + CHECK_OK(FunctionDefToBodyHelper( *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld, &host_fbody)); Graph *host_graph = host_fbody->graph; Node *key_placeholder = nullptr, *sequencer = nullptr; @@ -377,7 +394,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { EXPECT_EQ(num_recv_at_host, 1); for (Node *n : send_recv_nodes) { Node *input_node; - TF_CHECK_OK(n->input_node(n->num_inputs() - 1, &input_node)); + CHECK_OK(n->input_node(n->num_inputs() - 1, &input_node)); EXPECT_EQ(input_node, key_placeholder); bool has_control_edge_to_sequencer = false; @@ -399,10 +416,10 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output const0 = ops::Const(s.WithOpName("const0"), 1, {2}); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); @@ -413,7 +430,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) { NameAttrList name_attrs; name_attrs.set_name("cluster"); *name_attrs.mutable_attr() = attrs; - TF_CHECK_OK(ExtractOutsideCompilationTest( + CHECK_OK(ExtractOutsideCompilationTest( "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph", host_compute_core, &fld, &shape_inference_graphs, &has_outside_compilation)); @@ -435,7 +452,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { Output identity = ops::Identity(s.WithOpName("identity_true_fn"), arg); ops::_Retval retval(s.WithOpName("retval"), identity, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); auto node_name_image = g->BuildNodeNameIndex(); node_name_image["identity_true_fn"]->AddAttr("_oc", "0"); PartialTensorShape shape({2}); @@ -443,7 +460,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { kXlaInferredShapesAttrName, std::vector{shape}); FunctionDef *true_fn_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "true_fn", true_fn_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "true_fn", true_fn_fdef)); } { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -451,7 +468,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { Output identity = ops::Identity(s.WithOpName("identity_false_fn"), arg); ops::_Retval retval(s.WithOpName("retval"), identity, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + 
CHECK_OK(s.ToGraph(g.get())); auto node_name_image = g->BuildNodeNameIndex(); node_name_image["identity_false_fn"]->AddAttr("_oc", "0"); PartialTensorShape shape({2}); @@ -459,7 +476,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { kXlaInferredShapesAttrName, std::vector{shape}); FunctionDef *false_fn_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "false_fn", false_fn_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "false_fn", false_fn_fdef)); } { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -474,10 +491,10 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { true_fn, false_fn); ops::_Retval retval(s.WithOpName("retval"), if_op.output[0], 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); @@ -488,7 +505,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { NameAttrList name_attrs; name_attrs.set_name("cluster"); *name_attrs.mutable_attr() = attrs; - TF_CHECK_OK(ExtractOutsideCompilationTest( + CHECK_OK(ExtractOutsideCompilationTest( "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph", host_compute_core, &fld, &shape_inference_graphs, &has_outside_compilation)); @@ -500,9 +517,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { device_ordinal_temp_value.set_i(0); protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), - AttrSlice(&host_func_attrs), &fld, - &host_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), + AttrSlice(&host_func_attrs), &fld, + &host_fbody)); Graph *host_graph = host_fbody->graph; auto node_name_index = host_graph->BuildNodeNameIndex(); @@ -515,7 +532,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { Node *if_oc_node = node_name_index["oc_if_if"]; EXPECT_NE(if_oc_node, nullptr); Node *if_oc_node_cond_input; - TF_CHECK_OK(if_oc_node->input_node(0, &if_oc_node_cond_input)); + CHECK_OK(if_oc_node->input_node(0, &if_oc_node_cond_input)); EXPECT_EQ(if_oc_node_cond_input, recv_if_pred_node); // Check that then_branch outside compilation has node "identity_true_fn". @@ -546,8 +563,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { // Check XLA graph. 
{ std::unique_ptr xla_fbody; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), - AttrSlice(), &fld, &xla_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), + AttrSlice(), &fld, &xla_fbody)); Graph *xla_graph = xla_fbody->graph; auto node_name_index = xla_graph->BuildNodeNameIndex(); @@ -569,7 +586,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { Node *if_node = node_name_index["if"]; EXPECT_NE(if_node, nullptr); std::vector token_inputs; - TF_CHECK_OK( + CHECK_OK( GetNodeAttr(if_node->def(), "_xla_token_input_nodes", &token_inputs)); EXPECT_THAT(token_inputs, ::testing::ElementsAre("send_oc_if_pred_if")); } @@ -586,7 +603,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { Output identity = ops::Identity(s.WithOpName("identity_cond_fn"), arg); ops::_Retval retval(s.WithOpName("retval"), identity, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); auto node_name_image = g->BuildNodeNameIndex(); node_name_image["identity_cond_fn"]->AddAttr("_oc", "0"); PartialTensorShape shape({2}); @@ -594,7 +611,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { kXlaInferredShapesAttrName, std::vector{shape}); FunctionDef *cond_fn_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "cond_fn", cond_fn_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "cond_fn", cond_fn_fdef)); } { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -602,7 +619,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { Output identity = ops::Identity(s.WithOpName("identity_body_fn"), arg); ops::_Retval retval(s.WithOpName("retval"), identity, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); auto node_name_image = g->BuildNodeNameIndex(); node_name_image["identity_body_fn"]->AddAttr("_oc", "0"); PartialTensorShape shape({2}); @@ -610,7 +627,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { kXlaInferredShapesAttrName, std::vector{shape}); FunctionDef *body_fn_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "body_fn", body_fn_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "body_fn", body_fn_fdef)); } { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -624,10 +641,10 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { cond_fn, body_fn); ops::_Retval retval(s.WithOpName("retval"), while_op.output[0], 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); @@ -638,7 +655,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { NameAttrList name_attrs; name_attrs.set_name("cluster"); *name_attrs.mutable_attr() = attrs; - TF_CHECK_OK(ExtractOutsideCompilationTest( + CHECK_OK(ExtractOutsideCompilationTest( "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph", host_compute_core, &fld, &shape_inference_graphs, &has_outside_compilation)); @@ -650,9 +667,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { device_ordinal_temp_value.set_i(0); protobuf::Map host_func_attrs; 
host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), - AttrSlice(&host_func_attrs), &fld, - &host_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), + AttrSlice(&host_func_attrs), &fld, + &host_fbody)); Graph *host_graph = host_fbody->graph; auto node_name_index = host_graph->BuildNodeNameIndex(); @@ -713,7 +730,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { Output identity = ops::Identity(s.WithOpName("identity"), arg); ops::_Retval retval(s.WithOpName("retval"), identity, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); auto node_name_image = g->BuildNodeNameIndex(); node_name_image["identity"]->AddAttr("_oc", "0"); PartialTensorShape shape({2}); @@ -721,7 +738,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { kXlaInferredShapesAttrName, std::vector{shape}); FunctionDef *true_fn_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "fn", true_fn_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "fn", true_fn_fdef)); } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); { @@ -736,35 +753,35 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { tensor_proto.add_int_val(1); } NodeDef const_def; - TF_CHECK_OK(NodeDefBuilder("const", "Const") - .Attr("dtype", DT_INT32) - .Attr("value", tensor_proto) - .Finalize(&const_def)); + CHECK_OK(NodeDefBuilder("const", "Const") + .Attr("dtype", DT_INT32) + .Attr("value", tensor_proto) + .Finalize(&const_def)); absl::Status s; Node *const_node = g->AddNode(const_def, &s); - TF_CHECK_OK(s); + CHECK_OK(s); NodeDef fn_def; - TF_CHECK_OK(NodeDefBuilder("fn", "fn", &fld) - .Input("const", 0, DT_INT32) - .Finalize(&fn_def)); + CHECK_OK(NodeDefBuilder("fn", "fn", &fld) + .Input("const", 0, DT_INT32) + .Finalize(&fn_def)); Node *fn_node = g->AddNode(fn_def, &s); - TF_CHECK_OK(s); + CHECK_OK(s); g->AddEdge(const_node, 0, fn_node, 0); NodeDef ret_def; - TF_CHECK_OK(NodeDefBuilder("ret", "_Retval") - .Attr("index", 0) - .Attr("T", DT_INT32) - .Input("fn", 0, DT_INT32) - .Finalize(&ret_def)); + CHECK_OK(NodeDefBuilder("ret", "_Retval") + .Attr("index", 0) + .Attr("T", DT_INT32) + .Input("fn", 0, DT_INT32) + .Finalize(&ret_def)); Node *ret_node = g->AddNode(ret_def, &s); - TF_CHECK_OK(s); + CHECK_OK(s); g->AddEdge(fn_node, 0, ret_node, 0); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); - TF_CHECK_OK(fld.AddFunctionDef(*xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); + CHECK_OK(fld.AddFunctionDef(*xla_fdef)); } protobuf::Map attrs; @@ -774,7 +791,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { NameAttrList name_attrs; name_attrs.set_name("cluster"); *name_attrs.mutable_attr() = attrs; - TF_CHECK_OK(ExtractOutsideCompilationTest( + CHECK_OK(ExtractOutsideCompilationTest( "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph", host_compute_core, &fld, &shape_inference_graphs, &has_outside_compilation)); @@ -786,9 +803,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { device_ordinal_temp_value.set_i(0); protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), - AttrSlice(&host_func_attrs), &fld, - &host_fbody)); + 
CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), + AttrSlice(&host_func_attrs), &fld, + &host_fbody)); Graph *host_graph = host_fbody->graph; auto node_name_index = host_graph->BuildNodeNameIndex(); @@ -797,9 +814,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { EXPECT_NE(call_node, nullptr); std::unique_ptr call_fbody; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("oc_func_call_host_fn"), - AttrSlice(&host_func_attrs), &fld, - &call_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*fld.Find("oc_func_call_host_fn"), + AttrSlice(&host_func_attrs), &fld, + &call_fbody)); // Verify we have _XlaRecvAtHost and _XlaSendFromHost nodes. bool has_recv = false, has_send = false; @@ -817,8 +834,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { // Check XLA graph. { std::unique_ptr xla_fbody; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), - AttrSlice(), &fld, &xla_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), + AttrSlice(), &fld, &xla_fbody)); Graph *xla_graph = xla_fbody->graph; auto node_name_index = xla_graph->BuildNodeNameIndex(); @@ -828,8 +845,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { EXPECT_EQ(fn_node->type_string(), "fn_oc"); std::unique_ptr call_fbody; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("fn_oc"), AttrSlice(), &fld, - &call_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*fld.Find("fn_oc"), AttrSlice(), &fld, + &call_fbody)); // Verify we have XlaHostCompute nodes. bool has_hc = false; @@ -857,7 +874,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0); Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); std::cout << "Graph is " << (*g).ToGraphDefDebug().DebugString() << std::endl; auto node_name_image = g->BuildNodeNameIndex(); @@ -869,7 +886,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, kXlaInferredShapesAttrName, std::vector{shape}); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); @@ -880,15 +897,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NameAttrList name_attrs; name_attrs.set_name("cluster"); *name_attrs.mutable_attr() = attrs; - TF_CHECK_OK(ExtractOutsideCompilationTest( + CHECK_OK(ExtractOutsideCompilationTest( "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph", host_compute_core, &fld, &shape_inference_graphs, &has_outside_compilation)); // Get rewritten XLA computation function. std::unique_ptr xla_fbody; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), - AttrSlice(), &fld, &xla_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), AttrSlice(), + &fld, &xla_fbody)); auto node_name_index = xla_fbody->graph->BuildNodeNameIndex(); // Check XlaHostCompute nodes. @@ -899,8 +916,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr. 
std::vector token_input_nodes; - TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), - "_xla_token_input_nodes", &token_input_nodes)); + CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), + "_xla_token_input_nodes", &token_input_nodes)); std::vector expected_token_input_nodes_0( {"_xla_token_arg_node"}); @@ -908,8 +925,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, token_input_nodes.clear(); std::vector expected_token_input_nodes_1( {"_xla_token_arg_node", "outside_compilation_0_host_compute"}); - TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), - "_xla_token_input_nodes", &token_input_nodes)); + CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), + "_xla_token_input_nodes", &token_input_nodes)); EXPECT_EQ(token_input_nodes, expected_token_input_nodes_1); // Check there is a control edge from host_compute_0 to host_compute_1. @@ -940,7 +957,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, s.WithOpName("identity1").WithControlDependencies(identity0), const0); Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); std::cout << "Graph is " << (*g).ToGraphDefDebug().DebugString() << std::endl; auto node_name_image = g->BuildNodeNameIndex(); @@ -952,7 +969,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, kXlaInferredShapesAttrName, std::vector{shape}); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); @@ -963,15 +980,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NameAttrList name_attrs; name_attrs.set_name("cluster"); *name_attrs.mutable_attr() = attrs; - TF_CHECK_OK(ExtractOutsideCompilationTest( + CHECK_OK(ExtractOutsideCompilationTest( "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph", host_compute_core, &fld, &shape_inference_graphs, &has_outside_compilation)); // Get rewritten XLA computation function. std::unique_ptr xla_fbody; - TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), - AttrSlice(), &fld, &xla_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), AttrSlice(), + &fld, &xla_fbody)); auto node_name_index = xla_fbody->graph->BuildNodeNameIndex(); // Check XlaHostCompute nodes. @@ -982,8 +999,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr. std::vector token_input_nodes; - TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), - "_xla_token_input_nodes", &token_input_nodes)); + CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), + "_xla_token_input_nodes", &token_input_nodes)); std::vector expected_token_input_nodes_0( {"_xla_token_arg_node"}); @@ -991,8 +1008,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, token_input_nodes.clear(); std::vector expected_token_input_nodes_1( {"_xla_token_arg_node", "outside_compilation_0_host_compute"}); - TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), - "_xla_token_input_nodes", &token_input_nodes)); + CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), + "_xla_token_input_nodes", &token_input_nodes)); EXPECT_EQ(token_input_nodes, expected_token_input_nodes_1); // Check there is a control edge from host_compute_0 to host_compute_1. 
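
The test-file hunks above consistently swap the TensorFlow-specific TF_CHECK_OK macro for CHECK_OK from "absl/log/check.h" (note the added absl/log:check deps and includes). Both forms abort the process when the status is not OK; the absl form works directly on any absl::Status and drops the extra TF header dependency. A minimal, self-contained sketch of that pattern is below — the DoSetup helper and its argument are invented purely for illustration and are not part of the patch.

    // Sketch only: illustrates CHECK_OK on an absl::Status, as used in the
    // updated tests above. Requires linking against Abseil's status and log
    // libraries.
    #include "absl/log/check.h"
    #include "absl/status/status.h"

    // Hypothetical helper returning a status, standing in for calls such as
    // Scope::ToGraph or GraphToFunctionDef in the real tests.
    absl::Status DoSetup(bool ready) {
      if (!ready) return absl::FailedPreconditionError("not ready");
      return absl::OkStatus();
    }

    int main() {
      // CHECK_OK logs the status message and terminates if DoSetup fails,
      // mirroring what TF_CHECK_OK did in the pre-patch tests.
      CHECK_OK(DoSetup(/*ready=*/true));
      return 0;
    }
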
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 325f79b95e3a5e..54d6276c05cc32 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -166,7 +166,7 @@ class ExecutableClosureStore { public: ExecutableClosureStore() : key_counter_(0) {} - using KeyT = string; + using KeyT = std::string; KeyT Produce(ExecutableClosure result) { mutex_lock l(mutex_); @@ -217,7 +217,8 @@ se::Stream* GetStream(OpKernelContext* ctx) { XlaComputationLaunchContext GetLaunchContext( const XlaPlatformInfo& platform_info, OpKernelContext* ctx, - xla::LocalClient* client, se::DeviceMemoryAllocator* allocator) { + xla::LocalClient* client, + stream_executor::DeviceAddressAllocator* allocator) { se::Stream* stream = GetStream(ctx); int device_ordinal = stream ? stream->parent()->device_ordinal() : client->default_device_ordinal(); @@ -230,7 +231,7 @@ XlaComputationLaunchContext GetLaunchContext( absl::Status GetTaskName(const absl::string_view device_name, std::string* task_name) { - string ignored; + std::string ignored; if (!DeviceNameUtils::SplitDeviceName(device_name, task_name, &ignored)) { return errors::InvalidArgument("Unable to parse device name: ", device_name); @@ -246,7 +247,7 @@ xla::SendDeviceMemoryFunction GetSendDeviceMemoryFunction( return [ctx, program_key]( int64_t channel_id, se::Stream* stream, const xla::Shape& shape, - const se::DeviceMemoryBase& device_memory_base, + const stream_executor::DeviceAddressBase& device_memory_base, const absl::flat_hash_map& frontend_attrs) -> absl::StatusOr>> { auto iter = frontend_attrs.find("_xla_host_transfer_rendezvous"); @@ -293,7 +294,7 @@ xla::RecvDeviceMemoryFunction GetRecvDeviceMemoryFunction( return [ctx, program_key]( int64_t channel_id, se::Stream* stream, const xla::Shape& shape, - se::DeviceMemoryBase* device_memory_base, + stream_executor::DeviceAddressBase* device_memory_base, const absl::flat_hash_map& frontend_attrs) -> absl::StatusOr>> { auto iter = frontend_attrs.find("_xla_host_transfer_rendezvous"); @@ -339,7 +340,7 @@ absl::StatusOr RunExecutable( const XlaComputationLaunchContext& launch_context, std::vector execution_inputs, xla::ExecutableRunOptions run_options, xla::LocalExecutable* executable, - OpKernelContext* ctx, se::DeviceMemoryAllocator* allocator) { + OpKernelContext* ctx, stream_executor::DeviceAddressAllocator* allocator) { VLOG(2) << "Executing Xla Computation."; Env* env = Env::Default(); auto start_time = env->NowMicros(); @@ -620,7 +621,7 @@ void XlaLocalLaunchBase::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { resource_var_ptrs[resources[i]] = variable_infos[i].var()->tensor(); } - std::shared_ptr allocator = + std::shared_ptr allocator = GetAllocator(ctx->device(), GetStream(ctx), platform_info); XlaComputationLaunchContext launch_context = GetLaunchContext(platform_info, ctx, client, allocator.get()); @@ -928,7 +929,7 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { XlaExecutableClosure closure = XlaExecutableClosureStore::Global()->Consume(key); - std::shared_ptr allocator = + std::shared_ptr allocator = GetAllocator(ctx->device(), GetStream(ctx), platform_info_); XlaComputationLaunchContext launch_context = GetLaunchContext(platform_info_, ctx, closure.client(), allocator.get()); diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 1d4031a4ffc926..89d5ea8863151b 100644 --- 
a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" #include +#include +#include #include #include #include @@ -24,33 +26,50 @@ limitations under the License. #include #include "absl/container/flat_hash_map.h" -#include "absl/memory/memory.h" +#include "absl/container/flat_hash_set.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" #include "absl/strings/match.h" -#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "absl/strings/str_split.h" #include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/control_flow_ops.h" #include "tensorflow/cc/ops/control_flow_ops_internal.h" +#include "tensorflow/cc/ops/data_flow_ops.h" #include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/functional_ops.h" #include "tensorflow/cc/ops/list_ops.h" +#include "tensorflow/cc/ops/logging_ops.h" +#include "tensorflow/cc/ops/math_ops.h" +#include "tensorflow/cc/ops/no_op.h" +#include "tensorflow/cc/ops/random_ops.h" #include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/sendrecv_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/cc/ops/state_ops.h" #include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h" #include "tensorflow/compiler/jit/node_matchers.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" -#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/core/common_runtime/graph_constructor.h" +#include "xla/tsl/lib/core/status_test_util.h" #include "tensorflow/core/common_runtime/graph_def_builder_util.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/test.h" @@ -503,7 +522,7 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopesGlobalJitOverridden) { ops::BinaryOp( "MatMul", a, b, builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "ScopeC")); - TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); + CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } FunctionDefLibrary flib; @@ -536,7 +555,7 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) { ops::BinaryOp( "MatMul", a, b, builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "ScopeC")); - TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); + CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation( @@ -574,7 +593,7 @@ TEST(XlaCompilationTest, CyclesWithSplittingScopes) { .WithName("D") 
.WithAttr(kXlaCompileAttr, true) .WithAttr(kXlaScopeAttr, "Scope2")); - TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); + CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation( @@ -607,7 +626,7 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) { .WithAttr(kXlaCompileAttr, true) .WithAttr(kXlaScopeAttr, "ScopeB")); ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C")); - TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); + CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation( @@ -797,11 +816,11 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) { auto BuildNoopNode = [](absl::string_view name, Graph* graph) { NodeDefBuilder builder(name, "NoOp"); NodeDef def; - TF_CHECK_OK(builder.Finalize(&def)); + CHECK_OK(builder.Finalize(&def)); absl::Status status; Node* node = graph->AddNode(def, &status); - TF_CHECK_OK(status); + CHECK_OK(status); return node; }; diff --git a/tensorflow/compiler/jit/node_matchers.cc b/tensorflow/compiler/jit/node_matchers.cc index db158fc84a0173..93c07d5539ccc2 100644 --- a/tensorflow/compiler/jit/node_matchers.cc +++ b/tensorflow/compiler/jit/node_matchers.cc @@ -15,16 +15,31 @@ limitations under the License. #include "tensorflow/compiler/jit/node_matchers.h" +#include +#include +#include +#include +#include +#include +#include #include +#include #include "absl/algorithm/container.h" +#include "absl/log/check.h" +#include "absl/log/log.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" -#include "absl/strings/str_replace.h" #include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/cc/framework/ops.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_node_util.h" namespace tensorflow { @@ -515,7 +530,7 @@ impl::NodeMatcherProperties impl::Attr(std::string name) { NodeMatcherProperties ConstantValue( const ::tensorflow::Input::Initializer& val) { - TF_CHECK_OK(val.status); + CHECK_OK(val.status); NodeMatcherProperties props; props.set_constant_value(val.tensor); return props; diff --git a/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc b/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc index a833e9827c028a..6f3450f67e0e38 100644 --- a/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc +++ b/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc @@ -13,25 +13,28 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "absl/strings/match.h" +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" -#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/functional_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/compiler/jit/encapsulate_util.h" #include "tensorflow/compiler/tf2xla/rearrange_function_argument.h" -#include "tensorflow/core/common_runtime/device_factory.h" +#include "xla/tsl/platform/errors.h" #include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/error_codes.pb.h" -#include "tensorflow/core/public/session_options.h" -#include "tensorflow/core/public/version.h" namespace tensorflow { @@ -49,9 +52,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) { auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg1, 0); auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg0, 1); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef)); } { // Function for While's "body". @@ -64,9 +67,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) { auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg0, 0); auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg1, 1); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef)); } { // Function for While's "cond". 
@@ -77,9 +80,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) { Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_BOOL, 1); auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg1, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "f3", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "f3", xla_fdef)); } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); @@ -106,11 +109,11 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) { auto ret2 = ops::_Retval(s.WithOpName("ret2"), while_op.output[0], 2); auto ret3 = ops::_Retval(s.WithOpName("ret3"), while_op.output[1], 3); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); std::vector> fbodies; - TF_CHECK_OK(RearrangeFunctionArguments( - [&](const NameAttrList &function, const FunctionBody **fbody) { + CHECK_OK(RearrangeFunctionArguments( + [&](const NameAttrList& function, const FunctionBody** fbody) { std::unique_ptr new_fbody; TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fld.Find(function.name()), AttrSlice(&function.attr()), @@ -136,33 +139,33 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) { const Node *if_node = node_name_index.at("if"); ASSERT_NE(if_node, nullptr); const Node *input_node; - TF_CHECK_OK(if_node->input_node(1, &input_node)); + CHECK_OK(if_node->input_node(1, &input_node)); EXPECT_EQ(input_node->name(), "arg1"); - TF_CHECK_OK(if_node->input_node(2, &input_node)); + CHECK_OK(if_node->input_node(2, &input_node)); EXPECT_EQ(input_node->name(), "arg0"); const Node *ret0_node = node_name_index.at("ret0"); ASSERT_NE(ret0_node, nullptr); - TF_CHECK_OK(ret0_node->input_node(0, &input_node)); + CHECK_OK(ret0_node->input_node(0, &input_node)); EXPECT_EQ(input_node->name(), "if"); const Node *ret1_node = node_name_index.at("ret1"); ASSERT_NE(ret1_node, nullptr); - TF_CHECK_OK(ret1_node->input_node(0, &input_node)); + CHECK_OK(ret1_node->input_node(0, &input_node)); EXPECT_EQ(input_node->name(), "arg0"); // Check node "while" input and output edges. const Node *while_node = node_name_index.at("while"); ASSERT_NE(while_node, nullptr); - TF_CHECK_OK(while_node->input_node(0, &input_node)); + CHECK_OK(while_node->input_node(0, &input_node)); EXPECT_EQ(input_node->name(), "arg1"); - TF_CHECK_OK(while_node->input_node(1, &input_node)); + CHECK_OK(while_node->input_node(1, &input_node)); EXPECT_EQ(input_node->name(), "arg0"); const Node *ret2_node = node_name_index.at("ret2"); ASSERT_NE(ret2_node, nullptr); - TF_CHECK_OK(ret2_node->input_node(0, &input_node)); + CHECK_OK(ret2_node->input_node(0, &input_node)); EXPECT_EQ(input_node->name(), "arg0"); const Node *ret3_node = node_name_index.at("ret3"); ASSERT_NE(ret3_node, nullptr); - TF_CHECK_OK(ret3_node->input_node(0, &input_node)); + CHECK_OK(ret3_node->input_node(0, &input_node)); EXPECT_EQ(input_node->name(), "while"); } @@ -182,9 +185,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest, auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg0, 1); auto ret2 = ops::_Retval(s.WithOpName("ret2"), arg2, 2); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef)); } { // Function for While's "cond". 
@@ -197,9 +200,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Output cond = ops::Const(s.WithOpName("const"), true, TensorShape({})); auto ret0 = ops::_Retval(s.WithOpName("ret0"), cond, 0); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); FunctionDef *xla_fdef = fdl.add_function(); - TF_CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef)); + CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef)); } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); @@ -217,7 +220,7 @@ TEST(RearrangeFunctionArgumentForFunctionTest, std::initializer_list{arg0, arg1, arg2}, cond_fn, body_fn); std::unique_ptr g(new Graph(OpRegistry::Global())); - TF_CHECK_OK(s.ToGraph(g.get())); + CHECK_OK(s.ToGraph(g.get())); std::vector> fbodies; absl::Status status = RearrangeFunctionArguments( diff --git a/tensorflow/compiler/jit/shape_inference_test.cc b/tensorflow/compiler/jit/shape_inference_test.cc index 599d442de4b092..807505672357cb 100644 --- a/tensorflow/compiler/jit/shape_inference_test.cc +++ b/tensorflow/compiler/jit/shape_inference_test.cc @@ -17,27 +17,28 @@ limitations under the License. #include "tensorflow/compiler/jit/shape_inference.h" +#include +#include #include #include +#include #include +#include "absl/log/check.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/control_flow_ops.h" #include "tensorflow/cc/ops/control_flow_ops_internal.h" #include "tensorflow/cc/ops/math_ops.h" #include "tensorflow/cc/ops/resource_variable_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/jit/test_util.h" #include "xla/tsl/lib/core/status_test_util.h" #include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/types.h" -#include "tsl/platform/status.h" namespace tensorflow { namespace { @@ -55,7 +56,7 @@ TEST(ShapeInferenceTest, Basics) { auto g = ops::AddN(root.WithOpName("G"), std::initializer_list{e, f}); std::unique_ptr graph(new Graph(OpRegistry::Global())); - TF_CHECK_OK(root.ToGraph(graph.get())); + CHECK_OK(root.ToGraph(graph.get())); GraphShapeInfo shape_info; TF_ASSERT_OK(InferShapes(graph.get(), /*arg_shapes=*/{}, @@ -84,7 +85,7 @@ TEST(ShapeInferenceTest, UseArgShapesForVariableBatchSize) { b.node()->AddAttr("_index", 1); std::unique_ptr graph(new Graph(OpRegistry::Global())); - TF_CHECK_OK(root.ToGraph(graph.get())); + CHECK_OK(root.ToGraph(graph.get())); std::map arg_shapes; arg_shapes[0].shape = TensorShape({2, 3}); @@ -118,7 +119,7 @@ TEST(ShapeInferenceTest, UseArgShapesForVariableBatchSizeIncompleteUserArgs) { b.node()->AddAttr("_index", 0); std::unique_ptr graph(new Graph(OpRegistry::Global())); - TF_CHECK_OK(root.ToGraph(graph.get())); + CHECK_OK(root.ToGraph(graph.get())); std::map arg_shapes; arg_shapes[0].shape = TensorShape({2, 3}); diff --git a/tensorflow/compiler/jit/test_util.cc b/tensorflow/compiler/jit/test_util.cc index 30a9ab51faf105..b72fd6e7aaa6eb 100644 --- a/tensorflow/compiler/jit/test_util.cc +++ b/tensorflow/compiler/jit/test_util.cc @@ -15,14 +15,28 @@ limitations under the License. 
#include "tensorflow/compiler/jit/test_util.h" +#include #include #include #include #include +#include +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/jit/shape_inference.h" #include "xla/status_macros.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/graph_runner.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" #include "tensorflow/core/framework/device_factory.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/public/version.h" namespace tensorflow { @@ -71,15 +85,15 @@ void DeviceSetup::AddDevicesAndSetUp( } std::vector> devices; - TF_CHECK_OK(DeviceFactory::AddDevices( - options, "/job:localhost/replica:0/task:0", &devices)); + CHECK_OK(DeviceFactory::AddDevices(options, "/job:localhost/replica:0/task:0", + &devices)); device_mgr_ = std::make_unique(std::move(devices)); OptimizerOptions opts; lib_def_ = std::make_unique(OpRegistry::Global(), FunctionDefLibrary()); if (fdef.has_value()) { - TF_CHECK_OK(lib_def_->AddFunctionDef(*fdef)); + CHECK_OK(lib_def_->AddFunctionDef(*fdef)); } pflr_ = std::make_unique( device_mgr_.get(), Env::Default(), /*config=*/nullptr, @@ -96,7 +110,7 @@ Device* DeviceSetup::GetDevice(const std::string& device_name) { std::string full_device_name = absl::StrCat( "/job:localhost/replica:0/task:0/device:", device_name, ":0"); Device* device; - TF_CHECK_OK(device_mgr_->LookupDevice(full_device_name, &device)); + CHECK_OK(device_mgr_->LookupDevice(full_device_name, &device)); return device; } diff --git a/tensorflow/compiler/jit/tests/BUILD b/tensorflow/compiler/jit/tests/BUILD index 40de3e19dfd6d1..4c6a59e3f682fc 100644 --- a/tensorflow/compiler/jit/tests/BUILD +++ b/tensorflow/compiler/jit/tests/BUILD @@ -49,6 +49,8 @@ tf_cc_test( deps = [ ":auto_clustering_test_helper", "//tensorflow/core:test", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test.cc b/tensorflow/compiler/jit/tests/auto_clustering_test.cc index d108bc51b5ee33..806abbeb8e6d6a 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test.cc @@ -13,9 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/jit/tests/auto_clustering_test_helper.h" -#include "tensorflow/core/lib/core/status_test_util.h" +#include "xla/tsl/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { @@ -85,7 +91,7 @@ absl::Status BenchmarkHelper(absl::string_view key, benchmark::State& state) { } void BM_MarkForCompilationPass_KerasImagenetMain(benchmark::State& state) { - TF_CHECK_OK(BenchmarkHelper("keras_imagenet_main", state)); + CHECK_OK(BenchmarkHelper("keras_imagenet_main", state)); } BENCHMARK(BM_MarkForCompilationPass_KerasImagenetMain); diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc index 6d7e5518524c29..1d51d4d1ca2b90 100644 --- a/tensorflow/compiler/jit/xla_cluster_util.cc +++ b/tensorflow/compiler/jit/xla_cluster_util.cc @@ -15,25 +15,50 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_cluster_util.h" +#include +#include +#include +#include #include #include +#include +#include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "absl/container/inlined_vector.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "absl/strings/match.h" -#include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" -#include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/jit/flags.h" +#include "xla/service/graphcycles/graphcycles.h" #include "xla/status_macros.h" -#include "tensorflow/core/common_runtime/function.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" +#include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/graph/edgeset.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/platform/hash.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/xla_config_registry.h" @@ -460,8 +485,8 @@ absl::StatusOr DoesAnyCalleeHaveRefNodes( return true; } - auto release_handle_on_return = gtl::MakeCleanup( - [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); }); + auto release_handle_on_return = + gtl::MakeCleanup([&] { CHECK_OK(lib_runtime->ReleaseHandle(handle)); }); const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle); 
TF_RETURN_IF_ERROR(GetNodesRelatedToRefVariablesInDirection( diff --git a/tensorflow/compiler/jit/xla_device_compiler_client.cc b/tensorflow/compiler/jit/xla_device_compiler_client.cc index 71be1f7ec6b25d..ff565042347ae1 100644 --- a/tensorflow/compiler/jit/xla_device_compiler_client.cc +++ b/tensorflow/compiler/jit/xla_device_compiler_client.cc @@ -81,7 +81,7 @@ absl::StatusOr XlaDeviceCompilerClient::BuildSerializedExecutable( xla::ExecutableBuildOptions build_options = GetExecutableBuildOptions( options, result, client_->default_device_ordinal()); TF_ASSIGN_OR_RETURN( - std::vector> aot_results, + std::vector> aot_results, client_->CompileAheadOfTime(*result.computation, argument_layouts, build_options)); TF_RET_CHECK(aot_results.size() == 1); diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 8b38d79f58e415..027fd494ed8af5 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_device_context.h" +#include #include #include #include @@ -22,15 +23,37 @@ limitations under the License. #include #include -#include "tensorflow/compiler/jit/xla_launch_util.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/compiler/jit/xla_tensor.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" #include "tensorflow/compiler/tf2xla/literal_util.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" -#include "xla/util.h" -#include "tensorflow/core/common_runtime/device.h" +#include "xla/client/local_client.h" +#include "xla/layout_util.h" +#include "xla/literal.h" +#include "xla/service/stream_pool.h" +#include "xla/shape.h" +#include "xla/shape_util.h" +#include "xla/status_macros.h" +#include "xla/stream_executor/allocator_stats.h" +#include "xla/stream_executor/event.h" +#include "xla/stream_executor/stream.h" +#include "xla/stream_executor/stream_executor.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" #include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_reference.h" -#include "tsl/platform/statusor.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/threadpool.h" namespace tensorflow { @@ -249,7 +272,7 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, // shape as it is derived from the cpu_tensor's shape using // shape_representation_fn_. xla::MutableBorrowingLiteral literal; - TF_CHECK_OK(HostTensorToMutableBorrowingLiteral( + CHECK_OK(HostTensorToMutableBorrowingLiteral( xla::LayoutUtil::GetWithDefaultLayout( xla_tensor->shaped_buffer().on_host_shape()), cpu_tensor, &literal)); diff --git a/tensorflow/compiler/jit/xla_kernel_creator_test.cc b/tensorflow/compiler/jit/xla_kernel_creator_test.cc index 12ab76a7c1ce37..1804b1728c8c7f 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator_test.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator_test.cc @@ -15,16 +15,23 @@ limitations under the License. 
#include "tensorflow/compiler/jit/xla_kernel_creator.h" -#include "absl/memory/memory.h" +#include +#include +#include + +#include "absl/log/check.h" #include "absl/status/status.h" -#include "tensorflow/core/common_runtime/device_factory.h" -#include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/framework/function_testlib.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/device_factory.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_properties.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" @@ -63,7 +70,7 @@ class XlaKernelCreatorTest : public ::testing::Test { auto* device_count = options.config.mutable_device_count(); device_count->insert({"CPU", 1}); std::vector> devices; - TF_CHECK_OK(DeviceFactory::AddDevices( + CHECK_OK(DeviceFactory::AddDevices( options, "/job:localhost/replica:0/task:0", &devices)); FunctionDefLibrary proto; diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 8ccb236897ce39..c35a7d0457c6ff 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -66,7 +66,6 @@ limitations under the License. #include "xla/tsl/framework/device_id_utils.h" #include "xla/tsl/framework/serving_device_selector_policies.h" #include "xla/tsl/platform/errors.h" -#include "xla/tsl/platform/status.h" #include "xla/tsl/platform/statusor.h" #include "xla/util.h" #include "tensorflow/core/common_runtime/dma_helper.h" @@ -85,6 +84,7 @@ limitations under the License. #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/tfrt/common/async_value_tensor.h" +#include "tsl/platform/casts.h" namespace tensorflow { namespace { @@ -323,7 +323,7 @@ absl::Status SetOutputForConstant( } ctx->op_device_context()->CopyCPUTensorToDevice( &const_tensor, device, output_tensor, - [&](absl::Status status) { TF_CHECK_OK(status); }); + [&](absl::Status status) { CHECK_OK(status); }); if (device->device_type() == DEVICE_GPU) { // The GPUDeviceContext enqueues the host->device transfer in a @@ -562,7 +562,7 @@ XlaComputationLaunchContext::BuildXlaCompilerArguments( } absl::flat_hash_map variable_info_lookup; - TF_CHECK_OK(CreateVariableInfoLookup(variable_args, variable_info_lookup)); + CHECK_OK(CreateVariableInfoLookup(variable_args, variable_info_lookup)); for (int64_t input_num = 0; input_num < inputs.size(); ++input_num) { const Tensor* input = inputs[input_num]; XlaCompiler::Argument& arg = out.emplace_back(); diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 2876b3a7b96373..401f15587fcf39 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -29,6 +29,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "xla/client/local_client.h" #include "xla/pjrt/pjrt_client.h" +#include "xla/service/maybe_owning_device_memory.h" #include "xla/service/shaped_buffer.h" #include "xla/stream_executor/device_memory_allocator.h" #include "tensorflow/core/framework/allocation_description.pb.h" diff --git a/tensorflow/compiler/jit/xla_launch_util_gpu_test.cc b/tensorflow/compiler/jit/xla_launch_util_gpu_test.cc index 563e75c5d61b28..e3f32f8403379a 100644 --- a/tensorflow/compiler/jit/xla_launch_util_gpu_test.cc +++ b/tensorflow/compiler/jit/xla_launch_util_gpu_test.cc @@ -112,7 +112,7 @@ class PjRtExecutionUtilGpuTest : public OpsTestBase { // Create the DeviceCompiler to help with compiling executables. auto pjrt_client_or = GetOrCreatePjRtClient(device_type_); - TF_CHECK_OK(pjrt_client_or.status()); + CHECK_OK(pjrt_client_or.status()); pjrt_client_ = pjrt_client_or.value(); device_compiler_ = new PjRtDeviceCompiler( std::make_unique( diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc index d8ed5feac79f12..a2eb031da6c38c 100644 --- a/tensorflow/compiler/jit/xla_launch_util_test.cc +++ b/tensorflow/compiler/jit/xla_launch_util_test.cc @@ -21,33 +21,50 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "absl/log/check.h" +#include "absl/status/statusor.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/jit/device_compilation_profiler.h" #include "tensorflow/compiler/jit/device_compiler.h" +#include "tensorflow/compiler/jit/device_executable_persistor.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/pjrt_device_compiler_client.h" #include "tensorflow/compiler/jit/variable_info.h" #include "tensorflow/compiler/jit/variable_info_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "xla/literal.h" +#include "xla/literal_util.h" #include "xla/pjrt/pjrt_client.h" #include "xla/pjrt/pjrt_common.h" +#include "xla/pjrt/pjrt_executable.h" #include "xla/pjrt/plugin/xla_cpu/cpu_client_options.h" #include "xla/pjrt/plugin/xla_cpu/xla_cpu_pjrt_client.h" #include "xla/tests/literal_test_util.h" #include "xla/tsl/framework/device_id_utils.h" #include "xla/tsl/lib/core/status_test_util.h" +#include "xla/tsl/platform/statusor.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/device_factory.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/type_index.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" #include "tensorflow/core/tfrt/common/create_pjrt_client_util.h" #include "tensorflow/core/tfrt/common/pjrt_util.h" -#include "tsl/platform/status.h" -#include "tsl/platform/statusor.h" namespace tensorflow { namespace { @@ 
-93,11 +110,11 @@ class PjRtExecutionUtilTest : public OpsTestBase { xla::CpuClientOptions options; options.asynchronous = true; options.cpu_device_count = 1; - TF_CHECK_OK(SetPjRtClientInTFGlobalResourceManager( + CHECK_OK(SetPjRtClientInTFGlobalResourceManager( device_type, xla::GetXlaPjrtCpuClient(options).value())); // device_context_ should be a PjRtDeviceContext. - TF_CHECK_OK(device_->TryGetDeviceContext(&device_context_)); + CHECK_OK(device_->TryGetDeviceContext(&device_context_)); // Get the host allocator. AllocatorAttributes host_alloc_attr; @@ -111,7 +128,7 @@ class PjRtExecutionUtilTest : public OpsTestBase { // Create the DeviceCompiler to help with compiling executables. auto pjrt_client_or = GetOrCreatePjRtClient(device_type_); - TF_CHECK_OK(pjrt_client_or.status()); + CHECK_OK(pjrt_client_or.status()); pjrt_client_ = pjrt_client_or.value(); device_compiler_ = new PjRtDeviceCompiler( std::make_unique( diff --git a/tensorflow/compiler/jit/xla_platform_info_test.cc b/tensorflow/compiler/jit/xla_platform_info_test.cc index 84fd60ef6c7e33..7b45521daf2827 100644 --- a/tensorflow/compiler/jit/xla_platform_info_test.cc +++ b/tensorflow/compiler/jit/xla_platform_info_test.cc @@ -18,17 +18,21 @@ limitations under the License. #include #include +#include "absl/log/check.h" +#include "tensorflow/compiler/jit/device_compilation_profiler.h" +#include "tensorflow/compiler/jit/device_compiler.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/test_util.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "xla/client/local_client.h" +#include "xla/pjrt/pjrt_client.h" #include "xla/pjrt/plugin/xla_cpu/cpu_client_options.h" #include "xla/pjrt/plugin/xla_cpu/xla_cpu_pjrt_client.h" +#include "xla/tsl/lib/core/status_test_util.h" +#include "xla/tsl/platform/statusor.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/refcount.h" -#include "tensorflow/core/platform/status_matchers.h" -#include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/tfrt/common/create_pjrt_client_util.h" #include "tensorflow/core/tfrt/common/pjrt_util.h" @@ -65,7 +69,7 @@ TEST_F(XlaPlatformInfoTest, BuildXlaDeviceCompilerXlaDeviceMetadata) { Device* device = device_setup_.GetDevice(DEVICE_XLA_GPU); const XlaDevice::Metadata* metadata = nullptr; - TF_CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata)); + CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata)); XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device); TF_ASSERT_OK_AND_ASSIGN( @@ -91,7 +95,7 @@ TEST_F(XlaPlatformInfoTest, BuildXlaDeviceCompilerXlaDeviceCacheEnabled) { Device* device = device_setup_.GetDevice(DEVICE_XLA_GPU); const XlaDevice::Metadata* metadata = nullptr; - TF_CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata)); + CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata)); XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device); TF_ASSERT_OK_AND_ASSIGN( @@ -134,7 +138,7 @@ TEST_F(XlaPlatformInfoTest, GetOrCreatePjRtDeviceCompilerAndProfilerXlaDevice) { Device* device = device_setup_.GetDevice(device_type.type()); const XlaDevice::Metadata* metadata = nullptr; - TF_CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata)); + CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata)); 
XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device); ResourceMgr resource_mgr(""); @@ -254,7 +258,7 @@ TEST_F(XlaPlatformInfoTest, xla::CpuClientOptions options; options.asynchronous = true; options.cpu_device_count = 1; - TF_CHECK_OK(SetPjRtClientInTFGlobalResourceManager( + CHECK_OK(SetPjRtClientInTFGlobalResourceManager( device_type, xla::GetXlaPjrtCpuClient(options).value())); TF_ASSERT_OK_AND_ASSIGN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index e9cdad219dd28d..d6792cd7802d96 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -55,7 +55,7 @@ absl::Status XlaTensor::AllocateShapedBuffer(DataType dtype, xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first); uint64 size = client->backend().transfer_manager()->GetByteSizeRequirement(subshape); - TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer, + TF_ASSIGN_OR_RETURN(se::ScopedDeviceAddress buffer, client->backend().memory_allocator()->Allocate( device_ordinal, size, /*retry_on_failure=*/false, subshape.layout().memory_space())); diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index ab6c5abeca86f0..b48a8ef6411711 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -1699,6 +1699,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", @@ -1714,7 +1715,6 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:TranslateLib", "@local_xla//xla/tsl/platform:errors", - "@local_xla//xla/tsl/platform:status", "@local_xla//xla/tsl/platform:statusor", "@stablehlo//:stablehlo_ops", "@stablehlo//:vhlo_ops", diff --git a/tensorflow/compiler/mlir/lite/debug/debug_test.cc b/tensorflow/compiler/mlir/lite/debug/debug_test.cc index b82d5725182745..a9337c0c84f944 100644 --- a/tensorflow/compiler/mlir/lite/debug/debug_test.cc +++ b/tensorflow/compiler/mlir/lite/debug/debug_test.cc @@ -103,20 +103,21 @@ class InitPassManagerTest : public testing::Test { context_.loadAllAvailableDialects(); mlir::OpBuilder builder(&context_); - module_ = builder.create(builder.getUnknownLoc()); + module_ = mlir::ModuleOp::create(builder, builder.getUnknownLoc()); builder.setInsertionPointToStart(module_->getBody()); - auto func = builder.create( // - builder.getUnknownLoc(), "main", builder.getFunctionType({}, {})); + auto func = mlir::func::FuncOp::create(builder, // + builder.getUnknownLoc(), "main", + builder.getFunctionType({}, {})); func->setAttr("tfl.func", builder.getUnitAttr()); builder.setInsertionPointToStart(func.addEntryBlock()); llvm::SmallVector shape{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - builder.create( - builder.getUnknownLoc(), + mlir::arith::ConstantOp::create( + builder, builder.getUnknownLoc(), mlir::DenseIntElementsAttr::get( mlir::RankedTensorType::get(shape.size(), builder.getI32Type()), shape)); - builder.create(builder.getUnknownLoc()); + mlir::func::ReturnOp::create(builder, builder.getUnknownLoc()); } absl::Status GetDumpDir(std::string* dump_dir) { diff --git a/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc b/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc index 533a69bdfd9efa..614f9738356019 100644 --- 
a/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc +++ b/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc @@ -163,8 +163,8 @@ func::FuncOp BuildFuncOp(const Subgraph& subgraph, OpBuilder& builder, Value cloned_output = values_in_scope.lookup(result); return_operands.push_back(cloned_output); } - function_builder.create(new_func.getLoc(), - return_operands); + mlir::func::ReturnOp::create(function_builder, new_func.getLoc(), + return_operands); ops_added.func_op = new_func; module.push_back(new_func); return new_func; @@ -179,8 +179,8 @@ void ExtractSubgraphToFunc(const Subgraph& subgraph, OpBuilder& builder, Operation* last_output = subgraph.partition_ops_.back(); builder.setInsertionPoint(last_output); - auto call_op = builder.create(last_output->getLoc(), func, - subgraph.FuncArguments()); + auto call_op = func::CallOp::create(builder, last_output->getLoc(), func, + subgraph.FuncArguments()); ops_added.call_op = call_op; // FuncOutputs refer to the original `Values` in input module which are now // invalid after pulling out the defining ops. The values in diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc index 787190318b63ad..c5c8c040c2bb28 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc @@ -89,8 +89,8 @@ void ConvertQuantizedOpToFloat(mlir::func::FuncOp func, OpBuilder* builder) { auto dequantized_input_type = mlir::quant::QuantizedType::castToExpressedType(input_type); builder->setInsertionPoint(op); - auto dequantize_op = builder->create( - op->getLoc(), dequantized_input_type, input.get()); + auto dequantize_op = TFL::DequantizeOp::create( + *builder, op->getLoc(), dequantized_input_type, input.get()); dequantized_inputs.push_back(dequantize_op); } else { dequantized_inputs.push_back(input.get()); @@ -126,8 +126,9 @@ void ConvertQuantizedOpToFloat(mlir::func::FuncOp func, OpBuilder* builder) { Value new_result = new_op->getResult(i); if (IsQI8Type(result_type) || IsQUI8Type(result_type)) { builder->setInsertionPoint(op); - TFL::QuantizeOp quant_op = builder->create( - op->getLoc(), result_type, new_result, TypeAttr::get(result_type)); + TFL::QuantizeOp quant_op = + TFL::QuantizeOp::create(*builder, op->getLoc(), result_type, + new_result, TypeAttr::get(result_type)); new_result = quant_op.getResult(); } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc index d701254f333322..e6d7c6425abafe 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc @@ -85,11 +85,11 @@ TFL::ReshapeOp InsertReshapeOp(Location loc, Value input, Type element_type, auto new_shape_attr = mlir::DenseIntElementsAttr::get(reshape_shape_type, new_shape_array_i32); - auto new_shape = builder->create(loc, new_shape_attr); + auto new_shape = TFL::ConstOp::create(*builder, loc, new_shape_attr); auto reshape_out_type = RankedTensorType::get(new_shape_array, element_type); - return builder->create(loc, reshape_out_type, input, - new_shape); + return TFL::ReshapeOp::create(*builder, loc, reshape_out_type, input, + new_shape); } LogicalResult EnsureBias(Operation* 
op, int bias_idx, @@ -148,7 +148,7 @@ TF::ConstOp PadConstValues(Operation* input_op, int value_to_pad, auto new_value_i32_attr = mlir::DenseIntElementsAttr::get(value_shape_type, value_i32); - return builder->create(loc, new_value_i32_attr); + return TF::ConstOp::create(*builder, loc, new_value_i32_attr); } SmallVector SliceOutputs(Operation* split_op, Value input, @@ -186,13 +186,13 @@ SmallVector SliceOutputs(Operation* split_op, Value input, mlir::DenseIntElementsAttr::get(slice_type, slice_size); auto slice_begin_const = - rewriter->create(split_op->getLoc(), slice_begin_attr); + TFL::ConstOp::create(*rewriter, split_op->getLoc(), slice_begin_attr); auto slice_size_const = - rewriter->create(split_op->getLoc(), slice_size_attr); + TFL::ConstOp::create(*rewriter, split_op->getLoc(), slice_size_attr); - auto slice_op = rewriter->create( - split_op->getLoc(), current_output_type, input, slice_begin_const, - slice_size_const); + auto slice_op = + TFL::SliceOp::create(*rewriter, split_op->getLoc(), current_output_type, + input, slice_begin_const, slice_size_const); // Rewire output. slice_outputs.push_back(slice_op.getResult()); diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/pick_subgraphs.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/pick_subgraphs.cc index 58940205edf1ab..300daee0f9a40d 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/pick_subgraphs.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/pick_subgraphs.cc @@ -430,8 +430,8 @@ void PickSubgraphsPass::RewireSubgraphs( if (call.getCallee() != impl.getName()) { // We need to rebuild the call op. :( builder->setInsertionPoint(call); - auto new_call = builder->create(call.getLoc(), impl, - call.getOperands()); + auto new_call = func::CallOp::create(*builder, call.getLoc(), impl, + call.getOperands()); // Set interface_name & target to the call_op as well. new_call->setAttr(kInterfaceNameAttr, diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 41dffc228a6b2c..67eef87eb872ad 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -269,7 +269,7 @@ static StatusOr GetTFLiteType(Type type, static bool IsConst(Operation* op) { return isa(op); } @@ -632,6 +632,12 @@ class Translator { std::optional> BuildBuffer( Value value, bool can_be_deduplicated, int& index); + // Builds external buffer and external buffer group from the given value. If + // the value is not defined by a constant op with external buffer attributes, + // returns std::nullopt. + std::optional> BuildExternalBuffer( + Value value, uint32_t external_buffer_id); + // Build TFLite tensor from the given type. This function is for tfl.lstm // intermediates, which should have UniformQuantizedType. std::optional> BuildTensorFromType( @@ -647,6 +653,7 @@ class Translator { // corresponding buffer. Emits error and returns std::nullopt on failure. std::optional> BuildTensor( Value value, const std::string& name, unsigned buffer_idx, + unsigned external_buffer_id, const std::optional>& quant_parameters); @@ -858,6 +865,13 @@ class Translator { BufferOffset empty_buffer_; std::vector> buffers_; + + // External buffers + std::vector> external_buffers_; + std::vector> + external_buffer_groups_; + absl::flat_hash_map external_buffer_group_map_; + // Maps subgraph index and tensor name in the graph to the tensor index. 
absl::flat_hash_map> tensor_index_map_; @@ -986,6 +1000,44 @@ std::string Translator::UniqueName(mlir::Value val) { return std::string(name_mapper_.GetUniqueName(val)); } +std::optional> +Translator::BuildExternalBuffer(mlir::Value value, + uint32_t external_buffer_id) { + if (value.getDefiningOp() == nullptr) { + return std::nullopt; + } + auto inst = mlir::dyn_cast(value.getDefiningOp()); + if (!inst) { + return std::nullopt; + } + auto meta = inst.getExternalBufferAttr(); + if (!meta) { + return std::nullopt; + } + + std::string group_name = meta.getGroupName().str(); + uint64_t offset = meta.getOffset(); + uint64_t length = meta.getLength(); + std::string packing = meta.getPacking().str(); + + uint32_t group_index = 0; + if (auto it = external_buffer_group_map_.find(group_name); + it != external_buffer_group_map_.end()) { + group_index = it->second; + } else { + int index = external_buffer_groups_.size(); + external_buffer_groups_.push_back(tflite::CreateExternalBufferGroup( + builder_, builder_.CreateString(group_name))); + external_buffer_group_map_[group_name] = index; + group_index = index; + } + + auto external_buffer = tflite::CreateExternalBuffer( + builder_, external_buffer_id, group_index, offset, length, + builder_.CreateString(packing)); + return external_buffer; +} + std::optional> Translator::BuildBuffer( mlir::Value value, bool can_be_deduplicated, int& index) { can_be_deduplicated = can_be_deduplicated && !disable_buffer_deduping_; @@ -1241,11 +1293,13 @@ std::optional> Translator::BuildTensorFromType( /*buffer=*/0, builder_.CreateString(name), q_params, /*is_variable=*/false, /*sparsity=*/0, /*shape_signature=*/0, /*has_rank=*/tensor_type.hasRank(), - variant_params->empty() ? 0 : builder_.CreateVector(*variant_params)); + variant_params->empty() ? 0 : builder_.CreateVector(*variant_params), + /*external_buffer=*/0); } std::optional> Translator::BuildTensor( Value value, const std::string& name, unsigned buffer_idx, + unsigned external_buffer_id, const std::optional>& quant_parameters) { auto type = mlir::cast(value.getType()); @@ -1371,7 +1425,8 @@ std::optional> Translator::BuildTensor( (is_variable ? 0 : buffer_idx), builder_.CreateString(name), q_params, /*is_variable=*/is_variable, s_params, /*shape_signature=*/0, /*has_rank=*/has_rank, - variant_params->empty() ? 0 : builder_.CreateVector(*variant_params)); + variant_params->empty() ? 0 : builder_.CreateVector(*variant_params), + external_buffer_id); } else { return tflite::CreateTensor( builder_, builder_.CreateVector(shape), tflite_element_type, @@ -1379,7 +1434,8 @@ std::optional> Translator::BuildTensor( /*is_variable=*/is_variable, s_params, /*shape_signature=*/builder_.CreateVector(shape_signature), /*has_rank=*/has_rank, - variant_params->empty() ? 0 : builder_.CreateVector(*variant_params)); + variant_params->empty() ? 0 : builder_.CreateVector(*variant_params), + external_buffer_id); } } @@ -3292,27 +3348,41 @@ std::optional> Translator::BuildSubGraph( } } + // External buffer id is enforced to have MSB set to 1 to distinguish from + // buffer index/id, with the assumption that the number of external buffers + // are less than 2^31. + uint32_t external_buffer_id = + (1 << 31) | static_cast(external_buffers_.size()); int buffer_index = buffers_.size(); - // If a constant is returned as subgraph's output, this constant cannot be - // deduplicated. 
- const bool not_returned_by_subgraph = llvm::none_of( - value.getUsers(), - [](Operation* user) { return llvm::isa(user); }); + // TODO(ashwinm): Check if for stateful tensors, if it is also needed to // make the Buffer empty apart from setting the buffer_idx=0 in the // Tensor. This does not seem to affect runtime behavior for RNN/LSTM, // but would be good for reducing memory footprint. - if (value.getDefiningOp()) { + if (auto external_buffer_or = + BuildExternalBuffer(value, external_buffer_id); + external_buffer_or.has_value()) { + buffer_index = 0; + external_buffers_.push_back(*external_buffer_or); + } else if (value.getDefiningOp()) { + // If a constant is returned as subgraph's output, this constant cannot be + // deduplicated. + const bool not_returned_by_subgraph = + llvm::none_of(value.getUsers(), [](Operation* user) { + return llvm::isa(user); + }); auto buffer_or = BuildBuffer(value, not_returned_by_subgraph, buffer_index); if (!buffer_or) return false; + external_buffer_id = 0; buffers_.push_back(*buffer_or); } else { + external_buffer_id = 0; buffers_.push_back(empty_buffer_); } - auto tensor_or = - BuildTensor(value, tensor_name, buffer_index, quant_parameters); + auto tensor_or = BuildTensor(value, tensor_name, buffer_index, + external_buffer_id, quant_parameters); if (!tensor_or) return false; tensors.push_back(*tensor_or); @@ -4192,11 +4262,15 @@ std::optional Translator::TranslateInternal() { } auto signature_defs = CreateSignatureDefs(signature_defs_vec); - auto model = tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, - builder_.CreateVector(opcodes_), - builder_.CreateVector(subgraphs_), - description, builder_.CreateVector(buffers_), - metadata_buffer, *metadata, *signature_defs); + bool has_external_buffers = !external_buffers_.empty(); + auto model = tflite::CreateModel( + builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(opcodes_), + builder_.CreateVector(subgraphs_), description, + builder_.CreateVector(buffers_), metadata_buffer, *metadata, + *signature_defs, + has_external_buffers ? builder_.CreateVector(external_buffer_groups_) : 0, + has_external_buffers ? builder_.CreateVector(external_buffers_) : 0); + tflite::FinishModelBuffer(builder_, model); // There is a limit of 2GB for a flatbuffer. bool flatbuffer_limit_exceeded = builder_.GetSize() > flatbuffer_size_max; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 4bd8ae5ce0dbb3..ab7d782dba8d33 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "absl/log/check.h" #include "absl/log/log.h" #include "absl/status/status.h" #include "absl/status/statusor.h" @@ -101,7 +102,6 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "xla/tsl/platform/errors.h" -#include "xla/tsl/platform/status.h" #include "xla/tsl/platform/statusor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -457,9 +457,9 @@ std::string GetMlirOpName(const tflite::OperatorT& op, return mlir::GetMlirOpNameFromOpCode(op_code); } -StatusOr BuildExternalConstOp(const tflite::TensorT& tensor, - int32_t buffer_index, - OpBuilder builder, Location loc) { +StatusOr BuildExternalConstOpWithBufferIndex( + const tflite::TensorT& tensor, int32_t buffer_index, OpBuilder builder, + Location loc) { TF_ASSIGN_OR_RETURN(mlir::TensorType type, tfl::GetTensorType(tensor, builder, /*is_constant=*/true)); @@ -468,7 +468,45 @@ StatusOr BuildExternalConstOp(const tflite::TensorT& tensor, return errors::Internal("Constant doesn't have a shape"); } auto op = builder.create( - loc, shaped_type, builder.getI32IntegerAttr(buffer_index)); + loc, shaped_type, + /*buffer_index=*/builder.getI32IntegerAttr(buffer_index), + /*external_buffer=*/nullptr); + return op.getOperation(); +} + +StatusOr BuildExternalConstOpWithExternalBuffer( + const tflite::ModelT& model, const tflite::TensorT& tensor, + OpBuilder builder, Location loc) { + TF_ASSIGN_OR_RETURN(mlir::TensorType type, + tfl::GetTensorType(tensor, builder, + /*is_constant=*/true)); + auto shaped_type = llvm::dyn_cast(type); + if (!shaped_type) { + return errors::Internal("Constant doesn't have a shape"); + } + + tflite::ExternalBufferT* external_buffer = nullptr; + for (const auto& extbuf : model.external_buffers) { + if (extbuf->id == tensor.external_buffer) { + external_buffer = extbuf.get(); + break; + } + } + if (external_buffer == nullptr) { + return errors::Internal("External buffer not found"); + } + + std::string group_name = + model.external_buffer_groups[external_buffer->group]->name; + auto op = builder.create( + loc, shaped_type, /*buffer_index=*/nullptr, + /*external_buffer=*/ + tfl::ExternalBufferAttr::get( + builder.getContext(), + /*group_name=*/builder.getStringAttr(group_name), + /*offset=*/external_buffer->offset, + /*length=*/external_buffer->length, + /*packing=*/builder.getStringAttr(external_buffer->packing))); return op.getOperation(); } @@ -936,8 +974,8 @@ StatusOr ConvertOp( if (op_name == "tfl.lstm") { // TODO(b/147587779): add the right region if region is empty. op_state.addRegion(); - TF_CHECK_OK(AddOpIntermediatesForLstm(op, intermediate_types, op_state, loc, - builder)); + CHECK_OK(AddOpIntermediatesForLstm(op, intermediate_types, op_state, loc, + builder)); } if (op_name == "tfl.while") { // Adds two empty regions for "tfl.while". We will fill the regions after @@ -948,8 +986,8 @@ StatusOr ConvertOp( op_state.addRegion(); } if (op_name == "tfl.unidirectional_sequence_lstm") { - TF_CHECK_OK(AddOpIntermediatesForLstm(op, intermediate_types, op_state, loc, - builder)); + CHECK_OK(AddOpIntermediatesForLstm(op, intermediate_types, op_state, loc, + builder)); } if (op_name == "tfl.reshape") { // Flattens reshape ops when more than one dimension shape operand is given. @@ -1347,7 +1385,8 @@ mlir::ResultRange MaybeWrapInControlNode(mlir::Operation* op, // ordered_output_arrays in the same order. If signature is not null, then the // inputs/outputs in signature will be attached to the FuncOp. 
StatusOr ConvertSubgraph( - const tflite::SubGraphT& subgraph, llvm::StringRef name, + const tflite::ModelT& model, const tflite::SubGraphT& subgraph, + llvm::StringRef name, const std::vector>& op_codes, const std::vector& func_names, const std::vector>& buffers, @@ -1511,22 +1550,30 @@ StatusOr ConvertSubgraph( StatusOr op_or_err; std::vector buffer; // Check if constant tensor is stored outside of the flatbuffers. - if (IsValidBufferOffset(buffers[const_tensor.buffer]->offset)) { - const uint8_t* file_begin_ptr = - reinterpret_cast(model_ptr->allocation()->base()); - buffer = std::vector( - file_begin_ptr + buffers[const_tensor.buffer]->offset, - file_begin_ptr + buffers[const_tensor.buffer]->offset + - buffers[const_tensor.buffer]->size); + if (const_tensor.external_buffer != 0) { + op_or_err = BuildExternalConstOpWithExternalBuffer( + model, const_tensor, op_builder, const_loc); } else { - buffer = buffers[const_tensor.buffer]->data; + if (IsValidBufferOffset(buffers[const_tensor.buffer]->offset)) { + const uint8_t* file_begin_ptr = reinterpret_cast( + model_ptr->allocation()->base()); + + buffer = std::vector( + file_begin_ptr + buffers[const_tensor.buffer]->offset, + file_begin_ptr + buffers[const_tensor.buffer]->offset + + buffers[const_tensor.buffer]->size); + } else { + buffer = buffers[const_tensor.buffer]->data; + } + op_or_err = + use_external_constant + ? BuildExternalConstOpWithBufferIndex(const_tensor, + const_tensor.buffer, + op_builder, const_loc) + : BuildConstOp(const_tensor, buffer, const_tensor.is_variable, + op_builder, const_loc, use_stablehlo_constant); } - op_or_err = - use_external_constant - ? BuildExternalConstOp(const_tensor, const_tensor.buffer, - op_builder, const_loc) - : BuildConstOp(const_tensor, buffer, const_tensor.is_variable, - op_builder, const_loc, use_stablehlo_constant); + if (!op_or_err.ok()) { return emitError(const_loc, op_or_err.status().ToString()), op_or_err.status(); @@ -1584,23 +1631,29 @@ StatusOr ConvertSubgraph( StatusOr op_or_err; std::vector buffer; // Check if constant tensor is stored outside of the flatbuffers. - if (IsValidBufferOffset(buffers[const_tensor.buffer]->offset)) { - const uint8_t* file_begin_ptr = - reinterpret_cast(model_ptr->allocation()->base()); - - buffer = std::vector( - file_begin_ptr + buffers[const_tensor.buffer]->offset, - file_begin_ptr + buffers[const_tensor.buffer]->offset + - buffers[const_tensor.buffer]->size); + if (const_tensor.external_buffer != 0) { + op_or_err = BuildExternalConstOpWithExternalBuffer( + model, const_tensor, op_builder, const_loc); } else { - buffer = buffers[const_tensor.buffer]->data; + if (IsValidBufferOffset(buffers[const_tensor.buffer]->offset)) { + const uint8_t* file_begin_ptr = + reinterpret_cast(model_ptr->allocation()->base()); + + buffer = std::vector( + file_begin_ptr + buffers[const_tensor.buffer]->offset, + file_begin_ptr + buffers[const_tensor.buffer]->offset + + buffers[const_tensor.buffer]->size); + } else { + buffer = buffers[const_tensor.buffer]->data; + } + op_or_err = + use_external_constant + ? BuildExternalConstOpWithBufferIndex( + const_tensor, const_tensor.buffer, op_builder, const_loc) + : BuildConstOp(const_tensor, buffer, const_tensor.is_variable, + op_builder, const_loc, use_stablehlo_constant); } - op_or_err = - use_external_constant - ? 
BuildExternalConstOp(const_tensor, const_tensor.buffer, - op_builder, const_loc) - : BuildConstOp(const_tensor, buffer, const_tensor.is_variable, - op_builder, const_loc, use_stablehlo_constant); + if (!op_or_err.ok()) { return emitError(const_loc, op_or_err.status().ToString()), op_or_err.status(); @@ -1862,8 +1915,8 @@ OwningOpRef tflite::FlatBufferToMlir( SubgraphName(set_implicit_main_func, e.index(), *subgraph); uint32_t subgraph_index = static_cast(e.index()); auto func_or_error = ConvertSubgraph( - *subgraph, name, model->operator_codes, func_names, model->buffers, - base_loc, builder, + *model, *subgraph, name, model->operator_codes, func_names, + model->buffers, base_loc, builder, /*is_entry_point=*/ set_implicit_main_func ? e.index() == 0 diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td b/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td index 57e4ec22976df3..6fa287a8c8b013 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td @@ -166,4 +166,16 @@ def TFL_ConstBytesAttr : AttrDef { let hasCustomAssemblyFormat = 1; } +def TFL_ExternalBufferAttr : AttrDef { + let mnemonic = "external_buffer"; + let parameters = (ins + "::mlir::StringAttr":$group_name, + "uint64_t":$offset, + "uint64_t":$length, + "::mlir::StringAttr":$packing + ); + let summary = "Flatbuffer external buffer metadata."; + let assemblyFormat = "`<` struct(params) `>`"; +} + #endif // TFL_OP_ENUMS diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index c90859cd6accfe..4c7e784d5069fd 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -951,11 +951,15 @@ def TFL_ExternalConstOp : Op:$buffer_index, + OptionalAttr:$external_buffer + ); let results = (outs AnyTensor:$output); } diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h index 66d307dd2fbd86..1da38c2c9f466e 100644 --- a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h @@ -346,10 +346,10 @@ void CreateVerifier(mlir::Operation* quantizing_op, BoolAttr log = rewriter.getBoolAttr(quant_params.numeric_verify_spec.log_if_failed_flag); // Verify the quantized value by sending the result to the verifier. 
- rewriter.create( - quantizing_op->getLoc(), quantized_op->getResult(result_idx).getType(), - quantized_op->getResult(result_idx), quantizing_op->getResult(result_idx), - tolerance, log); + VerifierT::create(rewriter, quantizing_op->getLoc(), + quantized_op->getResult(result_idx).getType(), + quantized_op->getResult(result_idx), + quantizing_op->getResult(result_idx), tolerance, log); } template <> @@ -645,8 +645,8 @@ class QuantizationPattern : public RewritePattern { if (!matchPattern(q.getOperand(), m_Constant(&attr))) { continue; } - auto cst = rewriter.create( - quantized_op->getLoc(), attr); + auto cst = arith::ConstantOp::create(rewriter, + quantized_op->getLoc(), attr); quantizing_op->setOperand(i, cst.getResult()); } } diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc index 1e1f79af16cbd6..b131a5f0e1060b 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc @@ -43,7 +43,7 @@ limitations under the License. // Note: branched from tensorflow/lite/tools/optimize/quantize_weights_test.cc namespace { -tensorflow::string* g_test_model_dir = nullptr; +std::string* g_test_model_dir = nullptr; } // namespace namespace tflite { @@ -766,7 +766,7 @@ TEST_F(QuantizeWeightsTest, DequantizeConvBlocklisted) { } // namespace tflite int main(int argc, char** argv) { - tensorflow::string model_file; + std::string model_file; const std::vector flag_list = { tensorflow::Flag("test_model_file", &model_file, "Path to test tflite model file."), @@ -777,8 +777,7 @@ int main(int argc, char** argv) { std::cerr << "Required test_model_file\n"; std::abort(); } - g_test_model_dir = - new tensorflow::string(tensorflow::io::Dirname(model_file)); + g_test_model_dir = new std::string(tensorflow::io::Dirname(model_file)); ::tensorflow::port::InitMain(argv[0], &argc, &argv); return RUN_ALL_TESTS(); } diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc index 6c43167a78cbae..529b5d2161be32 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc @@ -150,10 +150,10 @@ struct InsertQuantOpsAfterTFFakeQuantOp // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp // and its users. 
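// Note on the rewiring below: `value.replaceAllUsesWith(dequantize)` also
// redirects the operand of the freshly created quantize op (it, too, is a
// user of `value`), which would leave a quantize <- dequantize <- quantize
// cycle. The follow-up `quantize.getOperation()->replaceUsesOfWith(dequantize,
// value)` undoes that single edge, leaving the intended chain:
//
//   value -> quantfork::QuantizeCastOp -> quantfork::DequantizeCastOp -> users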
Value value = tf_op.getOutputs(); - auto quantize = rewriter.create( - tf_op.getLoc(), qtype.getValue(), value); - auto dequantize = rewriter.create( - tf_op.getLoc(), res_type, quantize.getResult()); + auto quantize = quantfork::QuantizeCastOp::create(rewriter, tf_op.getLoc(), + qtype.getValue(), value); + auto dequantize = quantfork::DequantizeCastOp::create( + rewriter, tf_op.getLoc(), res_type, quantize.getResult()); value.replaceAllUsesWith(dequantize); quantize.getOperation()->replaceUsesOfWith(dequantize, value); diff --git a/tensorflow/compiler/mlir/lite/schema/BUILD b/tensorflow/compiler/mlir/lite/schema/BUILD index 0c0381439d2b5e..649e198336c911 100644 --- a/tensorflow/compiler/mlir/lite/schema/BUILD +++ b/tensorflow/compiler/mlir/lite/schema/BUILD @@ -1,5 +1,10 @@ load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") load("@rules_cc//cc:cc_library.bzl", "cc_library") + +# copybara:uncomment_begin(google-only) +# load("@flatbuffers//:flatbuffers.bzl", "flatbuffers_library", "ts_flatbuffers_library") +# copybara:uncomment_end + load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") @@ -131,3 +136,15 @@ tf_cc_test( "@flatbuffers//:flatc_library", ], ) + +# copybara:uncomment_begin(google-only) +# flatbuffers_library( +# name = "schema_fbslib", +# srcs = ["schema.fbs"], +# ) +# +# ts_flatbuffers_library( +# name = "schema_ts_fbs", +# deps = [":schema_fbslib"], +# ) +# copybara:uncomment_end diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc index 4107859b7412af..0dd7e1f3b97a1c 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc @@ -677,13 +677,12 @@ class ComposeUniformQuantizedConvolutionOp CreateI8F32UniformQuantizedType( uniform_quantize_call_op.getLoc(), *rewriter.getContext(), input_scale_value, input_zero_point_value); - auto input_uniform_quantize_op = - rewriter.create( - uniform_quantize_call_op.getLoc(), - /*result=*/ - mlir::cast(input_value.getType()) - .clone(input_quantized_element_type), - /*operand=*/input_value); + auto input_uniform_quantize_op = stablehlo::UniformQuantizeOp::create( + rewriter, uniform_quantize_call_op.getLoc(), + /*result=*/ + mlir::cast(input_value.getType()) + .clone(input_quantized_element_type), + /*operand=*/input_value); rewriter.replaceAllUsesWith(input_i8_to_f32_convert_op.getResult(), input_uniform_quantize_op.getResult()); @@ -754,8 +753,8 @@ class ComposeUniformQuantizedConvolutionOp /*quantization_dimension=*/3); // Create a new constant op for the filter in i8. 
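// Rough outline of the composition performed by this pattern, with names taken
// from the surrounding code:
//   1. CreateI8F32UniformQuantizedType derives the quantized element type from
//      the matched scale / zero-point constants.
//   2. stablehlo::UniformQuantizeOp is re-created on the input, its result
//      type being the original tensor type cloned with that element type.
//   3. The filter constant is re-created in i8, the convolution is rebuilt
//      over the quantized operands, and a stablehlo::UniformDequantizeOp is
//      appended so downstream consumers keep seeing float values.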
- auto quantized_filter_constant_op = rewriter.create( - filter_op->getLoc(), + auto quantized_filter_constant_op = stablehlo::ConstantOp::create( + rewriter, filter_op->getLoc(), /*output=*/ filter_i8_value_attr.getType().clone(filter_quantized_element_type), /*value=*/filter_i8_value_attr); @@ -797,18 +796,16 @@ class ComposeUniformQuantizedConvolutionOp SmallVector new_conv_output_types = { output_uniform_quantized_tensor_type}; - auto new_conv_op_with_output_type = - rewriter.create( - op.getLoc(), new_conv_output_types, op.getOperands(), - op->getAttrs()); + auto new_conv_op_with_output_type = stablehlo::ConvolutionOp::create( + rewriter, op.getLoc(), new_conv_output_types, op.getOperands(), + op->getAttrs()); rewriter.replaceAllUsesWith(op.getResult(), new_conv_op_with_output_type.getResult()); - auto new_output_dequant_op = - rewriter.create( - rewriter.getUnknownLoc(), - /*operand=*/new_conv_op_with_output_type); + auto new_output_dequant_op = stablehlo::UniformDequantizeOp::create( + rewriter, rewriter.getUnknownLoc(), + /*operand=*/new_conv_op_with_output_type); auto output_uniform_dequantize_call_op = cast( *output_uniform_quantize_call_op.getResult(0).user_begin()); @@ -1035,13 +1032,12 @@ class ComposeUniformQuantizedDotGeneralOp input_scale_value, input_zero_point_value); Value input_value = input_uniform_quantize_call_pattern->GetInputValue(); - auto input_uniform_quantize_op = - rewriter.create( - input_i8_to_f32_convert_op.getLoc(), - /*result=*/ - mlir::cast(input_value.getType()) - .clone(input_uniform_quantized_type), - /*operand=*/input_value); + auto input_uniform_quantize_op = stablehlo::UniformQuantizeOp::create( + rewriter, input_i8_to_f32_convert_op.getLoc(), + /*result=*/ + mlir::cast(input_value.getType()) + .clone(input_uniform_quantized_type), + /*operand=*/input_value); rewriter.replaceAllUsesWith(input_i8_to_f32_convert_op.getResult(), input_uniform_quantize_op.getResult()); @@ -1116,8 +1112,8 @@ class ComposeUniformQuantizedDotGeneralOp quantization_dimension); // Create a new constant op for the filter in i8. 
- auto quantized_filter_constant_op = rewriter.create( - filter_constant_op.getLoc(), + auto quantized_filter_constant_op = stablehlo::ConstantOp::create( + rewriter, filter_constant_op.getLoc(), /*output=*/ mlir::cast(filter_constant_op.getResult().getType()) .clone(filter_uniform_quantized_type), @@ -1157,8 +1153,8 @@ class ComposeUniformQuantizedDotGeneralOp output_uniform_quantize_call_op.getLoc(), *rewriter.getContext(), output_scale_value, output_zero_point_value); - auto new_dot_general_op = rewriter.create( - op.getLoc(), /*resultType0=*/ + auto new_dot_general_op = stablehlo::DotGeneralOp::create( + rewriter, op.getLoc(), /*resultType0=*/ mlir::cast(op.getResult().getType()) .clone(output_uniform_quantized_type), /*lhs=*/op.getLhs(), /*rhs=*/op.getRhs(), @@ -1168,10 +1164,9 @@ class ComposeUniformQuantizedDotGeneralOp rewriter.replaceAllUsesWith(op.getResult(), new_dot_general_op.getResult()); - auto new_output_dequant_op = - rewriter.create( - output_uniform_dequantize_call_op.getLoc(), - /*operand=*/new_dot_general_op); + auto new_output_dequant_op = stablehlo::UniformDequantizeOp::create( + rewriter, output_uniform_dequantize_call_op.getLoc(), + /*operand=*/new_dot_general_op); rewriter.replaceAllUsesWith(output_uniform_dequantize_call_op.getResult(0), new_output_dequant_op.getResult()); @@ -1423,13 +1418,12 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations input1_scale_value, input1_zero_point_value); Value input1_value = input1_uniform_quantize_call_pattern->GetInputValue(); - auto input1_uniform_quantize_op = - rewriter.create( - input1_uniform_quantize_call_op.getLoc(), - /*result=*/ - mlir::cast(input1_value.getType()) - .clone(input1_uniform_quantized_type), - /*operand=*/input1_value); + auto input1_uniform_quantize_op = stablehlo::UniformQuantizeOp::create( + rewriter, input1_uniform_quantize_call_op.getLoc(), + /*result=*/ + mlir::cast(input1_value.getType()) + .clone(input1_uniform_quantized_type), + /*operand=*/input1_value); rewriter.replaceAllUsesWith(input1_zero_point_subtract_op.getResult(), input1_uniform_quantize_op.getResult()); @@ -1462,13 +1456,12 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations input2_scale_value, input2_zero_point_value); Value input2_value = input2_uniform_quantize_call_pattern->GetInputValue(); - auto input2_uniform_quantize_op = - rewriter.create( - input2_uniform_quantize_call_op.getLoc(), - /*result=*/ - mlir::cast(input2_value.getType()) - .clone(input2_uniform_quantized_type), - /*operand=*/input2_value); + auto input2_uniform_quantize_op = stablehlo::UniformQuantizeOp::create( + rewriter, input2_uniform_quantize_call_op.getLoc(), + /*result=*/ + mlir::cast(input2_value.getType()) + .clone(input2_uniform_quantized_type), + /*operand=*/input2_value); rewriter.replaceAllUsesWith(input2_zero_point_subtract_op.getResult(), input2_uniform_quantize_op.getResult()); @@ -1512,8 +1505,8 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations output_uniform_quantize_call_op.getLoc(), *rewriter.getContext(), output_scale_value, output_zero_point_value); - auto new_dot_general_op = rewriter.create( - op.getLoc(), /*resultType0=*/ + auto new_dot_general_op = stablehlo::DotGeneralOp::create( + rewriter, op.getLoc(), /*resultType0=*/ mlir::cast(op.getResult().getType()) .clone(output_uniform_quantized_type), /*lhs=*/op.getLhs(), /*rhs=*/op.getRhs(), @@ -1523,10 +1516,9 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations rewriter.replaceAllUsesWith(op.getResult(), 
new_dot_general_op.getResult()); - auto new_output_dequant_op = - rewriter.create( - output_uniform_dequantize_call_op.getLoc(), - /*operand=*/new_dot_general_op); + auto new_output_dequant_op = stablehlo::UniformDequantizeOp::create( + rewriter, output_uniform_dequantize_call_op.getLoc(), + /*operand=*/new_dot_general_op); rewriter.replaceAllUsesWith(output_uniform_dequantize_call_op.getResult(0), new_output_dequant_op.getResult()); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc index 7608ff985f1eb9..0d8688b2c8855a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc @@ -328,22 +328,22 @@ class ConvertNdConvOp : public OpConversionPattern { size.push_back(input_shape[i] - pre_slice - post_slice); } - auto start_attr = rewriter.create( - value.getLoc(), + auto start_attr = TF::ConstOp::create( + rewriter, value.getLoc(), DenseIntElementsAttr::get( RankedTensorType::get({static_cast(start.size())}, rewriter.getI64Type()), start)); - auto size_attr = rewriter.create( - value.getLoc(), + auto size_attr = TF::ConstOp::create( + rewriter, value.getLoc(), DenseIntElementsAttr::get( RankedTensorType::get({static_cast(size.size())}, rewriter.getI64Type()), size)); auto output_type = RankedTensorType::get(size, input_type.getElementType()); - return rewriter.create(value.getLoc(), output_type, value, - start_attr, size_attr); + return TF::SliceOp::create(rewriter, value.getLoc(), output_type, value, + start_attr, size_attr); } void CreateConvOp(mhlo::ConvolutionOp conv_op, ArrayRef strides, @@ -381,14 +381,15 @@ class ConvertNdConvOp : public OpConversionPattern { mlir::dyn_cast(conv_op.getLhs().getType()); RankedTensorType padding_attr_type = mlir::RankedTensorType::get( {lhs_type.getRank(), 2}, rewriter.getIntegerType(64)); - auto padding_const = rewriter.create( - conv_op->getLoc(), + auto padding_const = TF::ConstOp::create( + rewriter, conv_op->getLoc(), mlir::DenseElementsAttr::get(padding_attr_type, ArrayRef(new_padding))); // Add Pad op. 
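// Explicit-padding path, sketched with the names used below (here
// `padding_values_attr` stands for the DenseElementsAttr built from
// `new_padding`): the (low, high) pad amounts are materialized as a [rank, 2]
// TF::ConstOp, a TF::PadOp is placed in front of the convolution input, and
// the convolution itself is then emitted with padding = "VALID" so the pad is
// not applied twice.
//
//   auto padding_const = TF::ConstOp::create(rewriter, conv_op->getLoc(),
//                                            padding_values_attr);
//   sliced_lhs = TF::PadOp::create(rewriter, conv_op->getLoc(),
//                                  pad_output_type, sliced_lhs, padding_const);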
auto pad_output_type = UnrankedTensorType::get(lhs_type.getElementType()); - sliced_lhs = rewriter.create( - conv_op->getLoc(), pad_output_type, sliced_lhs, padding_const); + sliced_lhs = + TF::PadOp::create(rewriter, conv_op->getLoc(), pad_output_type, + sliced_lhs, padding_const); padding = "VALID"; } @@ -422,28 +423,28 @@ class ConvertNdConvOp : public OpConversionPattern { hlo_filter_shape.end()); tf_filter_shape[2] = input_channels; tf_filter_shape[3] = hlo_filter_shape.back() / input_channels; - auto reshaped_filter = rewriter.create( - rhs.getLoc(), + auto reshaped_filter = mhlo::ReshapeOp::create( + rewriter, rhs.getLoc(), RankedTensorType::get(tf_filter_shape, filter_type.getElementType()), rhs); - output = rewriter.create( - conv_op.getLoc(), conv_output_type, sliced_lhs, reshaped_filter, - rewriter.getI64ArrayAttr(strides), + output = TF::DepthwiseConv2dNativeOp::create( + rewriter, conv_op.getLoc(), conv_output_type, sliced_lhs, + reshaped_filter, rewriter.getI64ArrayAttr(strides), /*padding=*/rewriter.getStringAttr(padding), /*explicit_paddings=*/rewriter.getI64ArrayAttr(new_padding), /*data_format=*/rewriter.getStringAttr("NHWC"), /*dilations=*/rewriter.getI64ArrayAttr(dilation)); } else if (num_spatial_dims == 3) { - output = rewriter.create( - conv_op.getLoc(), conv_output_type, sliced_lhs, rhs, + output = TF::Conv3DOp::create( + rewriter, conv_op.getLoc(), conv_output_type, sliced_lhs, rhs, rewriter.getI64ArrayAttr(strides), /*padding=*/rewriter.getStringAttr(padding), /*data_format=*/rewriter.getStringAttr("NDHWC"), /*dilations=*/rewriter.getI64ArrayAttr(dilation)); } else { - output = rewriter.create( - conv_op.getLoc(), conv_output_type, sliced_lhs, rhs, + output = TF::Conv2DOp::create( + rewriter, conv_op.getLoc(), conv_output_type, sliced_lhs, rhs, rewriter.getI64ArrayAttr(strides), /*use_cudnn_on_gpu=*/rewriter.getBoolAttr(true), /*padding=*/rewriter.getStringAttr(padding), @@ -462,8 +463,8 @@ class ConvertNdConvOp : public OpConversionPattern { dnums.getOutputFeatureDimension(), *dnums.getOutputSpatialDimensions().begin(), num_spatial_dims, conv_output_type, rewriter); - output = rewriter.create( - conv_op.getLoc(), conv_op.getType(), output, permutation); + output = mhlo::TransposeOp::create( + rewriter, conv_op.getLoc(), conv_op.getType(), output, permutation); } rewriter.replaceOp(conv_op, {output}); } @@ -513,8 +514,8 @@ class Convert1DConvOp : public OpConversionPattern { auto image_2d_type = RankedTensorType::get(image_2d_shape, image_type.getElementType()); auto loc = conv_op.getLoc(); - auto image_2d_op = rewriter.create( - conv_op.getLoc(), image_2d_type, conv_op.getLhs()); + auto image_2d_op = mhlo::ReshapeOp::create(rewriter, conv_op.getLoc(), + image_2d_type, conv_op.getLhs()); // Transpose image to get it into NWHC form (where H is the added dim). SmallVector image_permutation = { @@ -523,9 +524,9 @@ class Convert1DConvOp : public OpConversionPattern { dnums.getInputFeatureDimension()}; auto image_permutation_and_shape = GetPermutationAndTransposedShape( image_permutation, image_2d_type, rewriter); - auto transposed_image_2d_op = rewriter.create( - loc, image_permutation_and_shape.shape, image_2d_op->getResult(0), - image_permutation_and_shape.permutation); + auto transposed_image_2d_op = mhlo::TransposeOp::create( + rewriter, loc, image_permutation_and_shape.shape, + image_2d_op->getResult(0), image_permutation_and_shape.permutation); // Reshape kernel to add a new spatial dimension. 
auto kernel_type = mlir::cast(conv_op.getRhs().getType()); @@ -536,8 +537,8 @@ class Convert1DConvOp : public OpConversionPattern { kernel_2d_shape.push_back(1); auto kernel_2d_type = RankedTensorType::get(kernel_2d_shape, kernel_type.getElementType()); - auto kernel_2d_op = - rewriter.create(loc, kernel_2d_type, conv_op.getRhs()); + auto kernel_2d_op = mhlo::ReshapeOp::create(rewriter, loc, kernel_2d_type, + conv_op.getRhs()); // Transpose kernel to get it into WHIO form (where H is the added dim). SmallVector kernel_permutation = { @@ -547,9 +548,9 @@ class Convert1DConvOp : public OpConversionPattern { dnums.getKernelOutputFeatureDimension()}; auto kernel_permutation_and_shape = GetPermutationAndTransposedShape( kernel_permutation, kernel_2d_type, rewriter); - auto transposed_kernel_2d_op = rewriter.create( - loc, kernel_permutation_and_shape.shape, kernel_2d_op->getResult(0), - kernel_permutation_and_shape.permutation); + auto transposed_kernel_2d_op = mhlo::TransposeOp::create( + rewriter, loc, kernel_permutation_and_shape.shape, + kernel_2d_op->getResult(0), kernel_permutation_and_shape.permutation); // // Create 2d equivalents for 1d convolution attributes. @@ -638,12 +639,12 @@ class Convert1DConvOp : public OpConversionPattern { rewriter) .shape; - auto conv2d_op = rewriter.create( - loc, transposed_output_2d_shape, transposed_image_2d_op.getResult(), - transposed_kernel_2d_op.getResult(), window_strides_2d, padding_2d, - lhs_dilation_2d, rhs_dilation_2d, window_reversal_2d, dnums_2d, - conv_op.getFeatureGroupCount(), conv_op.getBatchGroupCount(), - conv_op.getPrecisionConfigAttr()); + auto conv2d_op = mhlo::ConvolutionOp::create( + rewriter, loc, transposed_output_2d_shape, + transposed_image_2d_op.getResult(), transposed_kernel_2d_op.getResult(), + window_strides_2d, padding_2d, lhs_dilation_2d, rhs_dilation_2d, + window_reversal_2d, dnums_2d, conv_op.getFeatureGroupCount(), + conv_op.getBatchGroupCount(), conv_op.getPrecisionConfigAttr()); OpResult conv2d_output = conv2d_op->getResult(0); auto conv2d_output_type = mlir::cast(conv2d_output.getType()); @@ -656,8 +657,8 @@ class Convert1DConvOp : public OpConversionPattern { // affectively applied. auto output_permutation_and_shape = GetInversePermutationAndShape( output_permutation, conv2d_output_type, rewriter); - auto transposed_output_2d_op = rewriter.create( - loc, output_permutation_and_shape.shape, conv2d_output, + auto transposed_output_2d_op = mhlo::TransposeOp::create( + rewriter, loc, output_permutation_and_shape.shape, conv2d_output, output_permutation_and_shape.permutation); // Drop the trailing spatial dimension from the output. 
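// Convert1DConvOp in brief: the 1-D mhlo.convolution is routed through the
// 2-D lowering. mhlo::ReshapeOp adds a dummy spatial dimension to image and
// kernel, mhlo::TransposeOp brings them into NWHC / WHIO order, a 2-D
// mhlo::ConvolutionOp is created from the *_2d strides/padding/dilation and
// dimension numbers built above, and the result is transposed back with the
// inverse permutation before the trailing spatial dimension is dropped again.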
@@ -804,11 +805,10 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp } else { limit_indices[channel_idx] = depth_idx + 1; } - return rewriter.create( - conv_op.getLoc(), tensor, - GetI64ElementsAttr(start_indices, &rewriter), - GetI64ElementsAttr(limit_indices, &rewriter), - GetI64ElementsAttr(strides, &rewriter)); + return mhlo::SliceOp::create(rewriter, conv_op.getLoc(), tensor, + GetI64ElementsAttr(start_indices, &rewriter), + GetI64ElementsAttr(limit_indices, &rewriter), + GetI64ElementsAttr(strides, &rewriter)); }; // Storage for smaller convolution results @@ -832,18 +832,19 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp RankedTensorType::get(new_output_shape, output_type.getElementType()); // Create a Smaller Convolution (Ensure compatibility) - auto conv_result = rewriter.create( - conv_op.getLoc(), new_output_type, sliced_input, sliced_kernel, - conv_op.getWindowStridesAttr(), conv_op.getPaddingAttr(), - conv_op.getLhsDilationAttr(), conv_op.getRhsDilationAttr(), - conv_op.getWindowReversalAttr(), conv_op.getDimensionNumbers(), 1, 1, + auto conv_result = mhlo::ConvolutionOp::create( + rewriter, conv_op.getLoc(), new_output_type, sliced_input, + sliced_kernel, conv_op.getWindowStridesAttr(), + conv_op.getPaddingAttr(), conv_op.getLhsDilationAttr(), + conv_op.getRhsDilationAttr(), conv_op.getWindowReversalAttr(), + conv_op.getDimensionNumbers(), 1, 1, conv_op.getPrecisionConfigAttr()); conv_results.push_back(conv_result); } - auto final_output = rewriter.create( - conv_op.getLoc(), conv_results, + auto final_output = mhlo::ConcatenateOp::create( + rewriter, conv_op.getLoc(), conv_results, rewriter.getI64IntegerAttr(dnums.getOutputFeatureDimension())); rewriter.replaceOp(conv_op, final_output.getResult()); return mlir::success(); @@ -854,8 +855,8 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp llvm::ArrayRef output_sizes, bool align_corners, ConversionPatternRewriter& rewriter) const { - Value output_sizes_attr = rewriter.create( - conv_op.getLoc(), + Value output_sizes_attr = TF::ConstOp::create( + rewriter, conv_op.getLoc(), DenseIntElementsAttr::get( RankedTensorType::get({static_cast(output_sizes.size())}, rewriter.getI32Type()), @@ -863,8 +864,8 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp // The value of half_pixel_centers couldn't be inferred from the IR and XLA // only support half_pixel_centers=True as in 01/11/2022. Here // half_pixel_centers=False is hardcoded. - Value output = rewriter.create( - conv_op.getLoc(), conv_op.getType(), conv_op.getLhs(), + Value output = TF::ResizeBilinearOp::create( + rewriter, conv_op.getLoc(), conv_op.getType(), conv_op.getLhs(), output_sizes_attr, /*align_corners=*/rewriter.getBoolAttr(align_corners), /*half_pixel_centers=*/rewriter.getBoolAttr(false)); @@ -1071,8 +1072,8 @@ class ConvertNonTrivialConvOp permutation.push_back(dnums.getKernelOutputFeatureDimension()); permutation.push_back(dnums.getKernelInputFeatureDimension()); - auto filter_transposed = rewriter.create( - conv_op.getLoc(), conv_op.getRhs(), + auto filter_transposed = mhlo::TransposeOp::create( + rewriter, conv_op.getLoc(), conv_op.getRhs(), DenseIntElementsAttr::get( RankedTensorType::get({static_cast(permutation.size())}, rewriter.getI64Type()), @@ -1082,8 +1083,9 @@ class ConvertNonTrivialConvOp // Lets hard-code the reverse indexes to be {0, 1} as the expectation is // that the kernel is always in HWOI format, with the above code. 
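// Sketch of the transposed-convolution lowering below, using names from the
// surrounding code: the kernel is transposed into HWOI order, its two spatial
// dimensions are reversed (hard-coded indexes {0, 1}), and the result feeds a
// TF::Conv2DBackpropInputOp together with a constant of the computed output
// sizes; a final mhlo::TransposeOp is added when the output layout is not
// already [b, 0, 1, f].
//
//   mhlo::ReverseOp filter = mhlo::ReverseOp::create(
//       rewriter, conv_op.getLoc(), reverse_filter_in,
//       rewriter.getI64TensorAttr({0, 1}));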
- mhlo::ReverseOp filter = rewriter.create( - conv_op.getLoc(), reverse_filter_in, rewriter.getI64TensorAttr({0, 1})); + mhlo::ReverseOp filter = + mhlo::ReverseOp::create(rewriter, conv_op.getLoc(), reverse_filter_in, + rewriter.getI64TensorAttr({0, 1})); // if output is not in [b, 0, 1, f] format, insert transpose to go back if (dnums.getOutputBatchDimension() != 0 || @@ -1112,23 +1114,23 @@ class ConvertNonTrivialConvOp auto output_type = RankedTensorType::get( transposed_output_shape, mlir::cast(conv_op.getRhs().getType()).getElementType()); - auto output_sizes = rewriter.create( - conv_op.getLoc(), + auto output_sizes = TF::ConstOp::create( + rewriter, conv_op.getLoc(), DenseIntElementsAttr::get( RankedTensorType::get( {static_cast(transposed_output_shape_i32.size())}, rewriter.getI32Type()), transposed_output_shape_i32)); - auto new_conv = rewriter.create( - conv_op.getLoc(), output_type, output_sizes, filter, conv_input, - rewriter.getI64ArrayAttr(strides), + auto new_conv = TF::Conv2DBackpropInputOp::create( + rewriter, conv_op.getLoc(), output_type, output_sizes, filter, + conv_input, rewriter.getI64ArrayAttr(strides), /*use_cudnn_on_gpu=*/rewriter.getBoolAttr(true), /*padding=*/rewriter.getStringAttr(padding), /*explicit_paddings=*/rewriter.getI64ArrayAttr({}), /*data_format=*/rewriter.getStringAttr("NHWC"), /*dilations=*/rewriter.getI64ArrayAttr(dilation)); - auto output_transpose = rewriter.create( - conv_op.getLoc(), new_conv.getResult(), + auto output_transpose = mhlo::TransposeOp::create( + rewriter, conv_op.getLoc(), new_conv.getResult(), rewriter.getI64TensorAttr(transpose_order)); conv_op->replaceAllUsesWith(output_transpose); rewriter.eraseOp(conv_op); @@ -1139,8 +1141,8 @@ class ConvertNonTrivialConvOp .getShape()) { output_shape_i32.push_back(dim); } - auto output_sizes = rewriter.create( - conv_op.getLoc(), + auto output_sizes = TF::ConstOp::create( + rewriter, conv_op.getLoc(), DenseIntElementsAttr::get( RankedTensorType::get( {static_cast(output_shape_i32.size())}, @@ -1255,12 +1257,12 @@ class ConvertSliceOp : public OpConversionPattern { LogicalResult matchAndRewrite( mhlo::SliceOp slice_op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { - auto begin = rewriter.create(slice_op.getLoc(), - slice_op.getStartIndices()); - auto end = rewriter.create(slice_op.getLoc(), - slice_op.getLimitIndices()); + auto begin = TF::ConstOp::create(rewriter, slice_op.getLoc(), + slice_op.getStartIndices()); + auto end = TF::ConstOp::create(rewriter, slice_op.getLoc(), + slice_op.getLimitIndices()); auto strides = - rewriter.create(slice_op.getLoc(), slice_op.getStrides()); + TF::ConstOp::create(rewriter, slice_op.getLoc(), slice_op.getStrides()); rewriter.replaceOpWithNewOp( slice_op, slice_op.getType(), slice_op.getOperand(), begin, end, strides); @@ -1294,22 +1296,24 @@ class ConvertDynamicSliceOp : public OpConversionPattern { // Clamp indices to [0, input_size - output_size] llvm::SmallVector start_indices_vector; start_indices_vector.reserve(op.getStartIndices().size()); - Value clamp_min = rewriter.create( - op.getLoc(), + Value clamp_min = TF::ConstOp::create( + rewriter, op.getLoc(), rewriter.getIntegerAttr(signed_start_indices_element_type, 0)); for (uint64_t i = 0, e = op.getStartIndices().size(); i < e; ++i) { // Always put a cast there. 
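// ConvertDynamicSliceOp, sketched with the names used below: every start
// index is cast to the signed index type, clamped into
// [0, input_size - slice_size] via two TF::ConstOp bounds and an
// mhlo::ClampOp, the clamped scalars are packed into one 1-D tensor with
// TF::PackOp, and the op is replaced by a TF slice driven by that tensor plus
// a constant of the requested slice sizes.
//
//   Value clamped_index = mhlo::ClampOp::create(
//       rewriter, op.getLoc(), cast_type, clamp_min, cast_op, clamp_max);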
auto start = op.getStartIndices()[i]; auto cast_type = mlir::cast(start.getType()) .clone(signed_start_indices_element_type); - auto cast_op = rewriter.create(op.getLoc(), cast_type, start); - Value clamp_max = rewriter.create( - op.getLoc(), rewriter.getIntegerAttr( - signed_start_indices_element_type, - input_type.getShape()[i] - - op.getSliceSizes().getValues()[i])); - Value clamped_index = rewriter.create( - op.getLoc(), cast_type, clamp_min, cast_op, clamp_max); + auto cast_op = + TF::CastOp::create(rewriter, op.getLoc(), cast_type, start); + Value clamp_max = TF::ConstOp::create( + rewriter, op.getLoc(), + rewriter.getIntegerAttr( + signed_start_indices_element_type, + input_type.getShape()[i] - + op.getSliceSizes().getValues()[i])); + Value clamped_index = mhlo::ClampOp::create( + rewriter, op.getLoc(), cast_type, clamp_min, cast_op, clamp_max); start_indices_vector.push_back(clamped_index); } @@ -1317,11 +1321,12 @@ class ConvertDynamicSliceOp : public OpConversionPattern { Type start_indices_type = RankedTensorType::get( {static_cast(start_indices_vector.size())}, signed_start_indices_element_type); - Value start_indices_op = rewriter.create( - op.getLoc(), start_indices_type, ValueRange(start_indices_vector)); + Value start_indices_op = + TF::PackOp::create(rewriter, op.getLoc(), start_indices_type, + ValueRange(start_indices_vector)); Value slice_sices_op = - rewriter.create(op.getLoc(), op.getSliceSizes()); + TF::ConstOp::create(rewriter, op.getLoc(), op.getSliceSizes()); rewriter.replaceOpWithNewOp(op, op.getType(), op.getOperand(), start_indices_op, slice_sices_op); return success(); @@ -1378,8 +1383,8 @@ Value BuildReshapeOp(ImplicitLocOpBuilder& builder, ArrayRef shape, Type idx_type, Type element_type) { Value shape_cst = BuildIntArrayConstOp(builder, rewriter, shape, idx_type); - Value reshaped_input = builder.create( - RankedTensorType::get(shape, element_type), input, shape_cst); + Value reshaped_input = TF::ReshapeOp::create( + builder, RankedTensorType::get(shape, element_type), input, shape_cst); return reshaped_input; } @@ -1389,8 +1394,9 @@ Value BuildSliceOp(ImplicitLocOpBuilder& builder, Value begin, ArrayRef shape, Type idx_type, Type element_type) { Value shape_cst = BuildIntArrayConstOp(builder, rewriter, shape, idx_type); - Value slice_result = builder.create( - RankedTensorType::get(shape, element_type), input, begin, shape_cst); + Value slice_result = + TF::SliceOp::create(builder, RankedTensorType::get(shape, element_type), + input, begin, shape_cst); return slice_result; } @@ -1416,8 +1422,8 @@ class ConvertDynamicUpdateSliceOp llvm::SmallVector start_indices_vector; Append(start_indices_vector, op.getStartIndices()); auto shape_tensor_type = RankedTensorType::get({shape_dim}, idx_type); - Value start_indices_tensor = rewriter.create( - op.getLoc(), shape_tensor_type, start_indices_vector); + Value start_indices_tensor = TF::PackOp::create( + rewriter, op.getLoc(), shape_tensor_type, start_indices_vector); rewriter.replaceOpWithNewOp( op, op.getType(), op.getOperand(), op.getUpdate(), start_indices_tensor); @@ -1584,7 +1590,7 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, bool is_lhs) { auto operand_type = mlir::cast(operand.getType()); BoolAttr true_attr = builder.getBoolAttr(true); - auto operand_shape = builder.create(operand, true_attr); + auto operand_shape = TF::ShapeOp::create(builder, operand, true_attr); const int64_t operand_rank = operand_type.getRank(); // Compute flattened out dimension and contracting dimension using // 
TF::UnsortedSegmentProdOp. @@ -1600,26 +1606,28 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, } auto seg_prod_result_type = RankedTensorType::get(static_cast(1), builder.getI32Type()); - auto out_segids_cst = builder.create( - builder.getI32TensorAttr(flattened_out_segids)); - auto contracting_segids_cst = builder.create( - builder.getI32TensorAttr(flattened_contracting_segids)); + auto out_segids_cst = TF::ConstOp::create( + builder, builder.getI32TensorAttr(flattened_out_segids)); + auto contracting_segids_cst = TF::ConstOp::create( + builder, builder.getI32TensorAttr(flattened_contracting_segids)); auto num_segids_tensor = - builder.create(builder.getI32IntegerAttr(1)); - auto flattened_out_dims = builder.create( - seg_prod_result_type, operand_shape, out_segids_cst, num_segids_tensor); - auto flattened_contracting_dims = builder.create( - seg_prod_result_type, operand_shape, contracting_segids_cst, + TF::ConstOp::create(builder, builder.getI32IntegerAttr(1)); + auto flattened_out_dims = TF::UnsortedSegmentProdOp::create( + builder, seg_prod_result_type, operand_shape, out_segids_cst, + num_segids_tensor); + auto flattened_contracting_dims = TF::UnsortedSegmentProdOp::create( + builder, seg_prod_result_type, operand_shape, contracting_segids_cst, num_segids_tensor); llvm::SmallVector flattend_shape_values; // Gather the batch dimensions. if (!dot_dimensions_info.batch_dimensions().AxesArray().empty()) { if (ShapedType::isDynamicShape( dot_dimensions_info.batch_dimensions().SizesArray())) { - auto batch_axes_tensor = - builder.create(builder.getI64TensorAttr( - dot_dimensions_info.batch_dimensions().AxesArray())); - auto batch_dims = builder.create( + auto batch_axes_tensor = TF::ConstOp::create( + builder, builder.getI64TensorAttr( + dot_dimensions_info.batch_dimensions().AxesArray())); + auto batch_dims = TF::GatherOp::create( + builder, RankedTensorType::get( {static_cast( dot_dimensions_info.batch_dimensions().AxesArray().size())}, @@ -1633,7 +1641,7 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, batch_i32_vec.push_back(static_cast(element)); } auto batch_dims = - builder.create(builder.getI32TensorAttr(batch_i32_vec)); + TF::ConstOp::create(builder, builder.getI32TensorAttr(batch_i32_vec)); flattend_shape_values.push_back(batch_dims); } } @@ -1649,9 +1657,9 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, builder.getIntegerType(32)); // Concatenate the batch dimensions, flattened out dimension and flattened // contracting dimension. 
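// BuildDotOperandFlattenedShapeOp, sketched: for dynamically shaped operands
// the flattened [batch..., out, contracting] shape is computed at runtime.
// Segment-id constants assign every dimension to either the flattened "out"
// group or the flattened "contracting" group, TF::UnsortedSegmentProdOp over
// the operand's shape tensor multiplies the sizes inside each group, batch
// sizes are gathered (or folded to an i32 constant when static), and the
// pieces are concatenated along axis 0 to form the target shape of the later
// mhlo::DynamicReshapeOp.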
- return builder.create( - concat_result_type, - builder.create(builder.getI32IntegerAttr(0)), + return TF::ConcatOp::create( + builder, concat_result_type, + TF::ConstOp::create(builder, builder.getI32IntegerAttr(0)), flattend_shape_values); } @@ -1682,8 +1690,8 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, lhs_dot_dimensions_info.batch_dimensions().SizesArray(), lhs_dot_dimensions_info.out_dimensions().SizesArray(), lhs_dot_dimensions_info.contracting_dimensions().SizesArray()); - auto lhs_transposed = rewriter.create( - loc, + auto lhs_transposed = mhlo::TransposeOp::create( + rewriter, loc, RankedTensorType::get(lhs_transposed_shape, lhs_type.getElementType()), lhs, DenseIntElementsAttr::get( @@ -1700,8 +1708,8 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, rhs_dot_dimensions_info.batch_dimensions().SizesArray(), rhs_dot_dimensions_info.contracting_dimensions().SizesArray(), rhs_dot_dimensions_info.out_dimensions().SizesArray()); - auto rhs_transposed = rewriter.create( - loc, + auto rhs_transposed = mhlo::TransposeOp::create( + rewriter, loc, RankedTensorType::get(rhs_transposed_shape, rhs_type.getElementType()), rhs, DenseIntElementsAttr::get( @@ -1717,15 +1725,15 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, lhs_dot_dimensions_info.FlattenedContractingDimensionSize()}); Value lhs_flattend; if (lhs_type.hasStaticShape()) { - lhs_flattend = rewriter.create( - loc, + lhs_flattend = mhlo::ReshapeOp::create( + rewriter, loc, RankedTensorType::get(lhs_flattened_shape, lhs_type.getElementType()), lhs_transposed.getResult()); } else { auto lhs_flattend_shape_op = BuildDotOperandFlattenedShapeOp( lhs, lhs_dot_dimensions_info, builder, /*is_lhs=*/true); - lhs_flattend = rewriter.create( - loc, + lhs_flattend = mhlo::DynamicReshapeOp::create( + rewriter, loc, RankedTensorType::get(lhs_flattened_shape, lhs_type.getElementType()), lhs_transposed, lhs_flattend_shape_op); } @@ -1739,15 +1747,15 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, rhs_dot_dimensions_info.FlattenedOutDimensionSize()}); Value rhs_flattend; if (rhs_type.hasStaticShape()) { - rhs_flattend = rewriter.create( - loc, + rhs_flattend = mhlo::ReshapeOp::create( + rewriter, loc, RankedTensorType::get(rhs_flattened_shape, rhs_type.getElementType()), rhs_transposed.getResult()); } else { auto rhs_flattend_shape_op = BuildDotOperandFlattenedShapeOp( rhs, rhs_dot_dimensions_info, builder, /*is_lhs=*/false); - rhs_flattend = rewriter.create( - loc, + rhs_flattend = mhlo::DynamicReshapeOp::create( + rewriter, loc, RankedTensorType::get(rhs_flattened_shape, rhs_type.getElementType()), rhs_transposed, rhs_flattend_shape_op); } @@ -1759,36 +1767,38 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, lhs_dot_dimensions_info.FlattenedOutDimensionSize()}, llvm::ArrayRef{ rhs_dot_dimensions_info.FlattenedOutDimensionSize()}); - auto matmul = rewriter.create( - loc, RankedTensorType::get(matmul_shape, result_type.getElementType()), + auto matmul = TF::BatchMatMulV3Op::create( + rewriter, loc, + RankedTensorType::get(matmul_shape, result_type.getElementType()), lhs_flattend, rhs_flattend); if (result_type.hasStaticShape()) { auto reshaped = - rewriter.create(loc, result_type, matmul.getResult()); + mhlo::ReshapeOp::create(rewriter, loc, result_type, matmul.getResult()); return reshaped.getResult(); } // Reshape for dynamic shaped operands. The result shape is // [lhs_batch_dimensions, lhs_out_dimensions, rhs_out_dimensions]. 
BoolAttr true_attr = rewriter.getBoolAttr(true); - auto lhs_shape = rewriter.create(loc, lhs, true_attr); - auto rhs_shape = rewriter.create(loc, rhs, true_attr); + auto lhs_shape = TF::ShapeOp::create(rewriter, loc, lhs, true_attr); + auto rhs_shape = TF::ShapeOp::create(rewriter, loc, rhs, true_attr); llvm::SmallVector lhs_batch_and_out = Concat(lhs_dot_dimensions_info.batch_dimensions().AxesArray(), lhs_dot_dimensions_info.out_dimensions().AxesArray()); - auto lhs_batch_and_out_cst = rewriter.create( - loc, rewriter.getI64TensorAttr(lhs_batch_and_out)); - auto lhs_batch_and_out_dims = rewriter.create( - loc, + auto lhs_batch_and_out_cst = TF::ConstOp::create( + rewriter, loc, rewriter.getI64TensorAttr(lhs_batch_and_out)); + auto lhs_batch_and_out_dims = TF::GatherOp::create( + rewriter, loc, RankedTensorType::get({static_cast(lhs_batch_and_out.size())}, rewriter.getIntegerType(32)), lhs_shape, lhs_batch_and_out_cst, true_attr); - auto rhs_out_cst = rewriter.create( - loc, rewriter.getI64TensorAttr( - rhs_dot_dimensions_info.out_dimensions().AxesArray())); - auto rhs_out_dims = rewriter.create( - loc, + auto rhs_out_cst = TF::ConstOp::create( + rewriter, loc, + rewriter.getI64TensorAttr( + rhs_dot_dimensions_info.out_dimensions().AxesArray())); + auto rhs_out_dims = TF::GatherOp::create( + rewriter, loc, RankedTensorType::get( {static_cast( rhs_dot_dimensions_info.out_dimensions().AxesArray().size())}, @@ -1800,13 +1810,13 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, lhs_dot_dimensions_info.out_dimensions().AxesArray().size() + rhs_dot_dimensions_info.out_dimensions().AxesArray().size())}, rewriter.getIntegerType(32)); - auto result_shape = rewriter.create( - loc, result_shape_type, - rewriter.create(loc, rewriter.getI32IntegerAttr(0)), + auto result_shape = TF::ConcatOp::create( + rewriter, loc, result_shape_type, + TF::ConstOp::create(rewriter, loc, rewriter.getI32IntegerAttr(0)), ValueRange{lhs_batch_and_out_dims, rhs_out_dims}); - auto reshaped = rewriter.create( - loc, result_type, matmul.getResult(), result_shape); + auto reshaped = mhlo::DynamicReshapeOp::create( + rewriter, loc, result_type, matmul.getResult(), result_shape); return reshaped.getResult(); } @@ -1844,9 +1854,10 @@ template LogicalResult rewriteNonMatchInitValue(mhlo::ReduceOp reduce_op, Value input, TF::ConstOp reduction_indices, ConversionPatternRewriter& rewriter) { - Value reduce_result = rewriter.create( - reduce_op.getLoc(), reduce_op.getType(0), input, reduction_indices, - /*keep_dim=*/rewriter.getBoolAttr(false)); + Value reduce_result = + TfReduceOp::create(rewriter, reduce_op.getLoc(), reduce_op.getType(0), + input, reduction_indices, + /*keep_dim=*/rewriter.getBoolAttr(false)); rewriter.replaceOpWithNewOp(reduce_op, reduce_op.getType(0), reduce_result, reduce_op.getInitValues()[0]); @@ -1902,8 +1913,9 @@ class ConvertReduceOpToTfOp : public OpConversionPattern { } auto dim_type = RankedTensorType::get( {static_cast(reduce_dims.size())}, rewriter.getI64Type()); - auto reduction_indices = rewriter.create( - reduce_op.getLoc(), dim_type, rewriter.getI64TensorAttr(reduce_dims)); + auto reduction_indices = + TF::ConstOp::create(rewriter, reduce_op.getLoc(), dim_type, + rewriter.getI64TensorAttr(reduce_dims)); // In `MatchReduceOpOperand` function, we already match that the // "mhlo::ReduceOp" only has one operand, one init_value and one result. 
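// rewriteNonMatchInitValue, sketched: when the init value is not the natural
// identity of the reduction, the mhlo.reduce is still lowered to the matching
// TF reduce op over constant reduction indices with keep_dim = false, and the
// original init value appears to be folded back in by the replaceOpWithNewOp
// call that combines it with the reduce result, preserving the numerics.
//
//   Value reduce_result = TfReduceOp::create(
//       rewriter, reduce_op.getLoc(), reduce_op.getType(0), input,
//       reduction_indices, /*keep_dim=*/rewriter.getBoolAttr(false));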
@@ -2103,25 +2115,26 @@ class ConvertIotaOpToTfRange : public OpConversionPattern { auto range_type = RankedTensorType::get({type.getShape()[dimension]}, element_type); - Value start_op = rewriter.create(iota_op.getLoc(), start); - Value limit_op = rewriter.create(iota_op.getLoc(), limit); - Value delta_op = rewriter.create(iota_op.getLoc(), delta); - Value result = rewriter.create(iota_op.getLoc(), range_type, - start_op, limit_op, delta_op); + Value start_op = TF::ConstOp::create(rewriter, iota_op.getLoc(), start); + Value limit_op = TF::ConstOp::create(rewriter, iota_op.getLoc(), limit); + Value delta_op = TF::ConstOp::create(rewriter, iota_op.getLoc(), delta); + Value result = TF::RangeOp::create(rewriter, iota_op.getLoc(), range_type, + start_op, limit_op, delta_op); if (type.getRank() > 1) { std::vector reshape_shape(type.getRank(), 1); reshape_shape[iota_op.getIotaDimension()] = type.getShape()[dimension]; auto reshape_type = RankedTensorType::get(reshape_shape, element_type); - Value reshape_shape_op = rewriter.create( - iota_op.getLoc(), rewriter.getI64TensorAttr(reshape_shape)); - result = rewriter.create(iota_op.getLoc(), reshape_type, - result, reshape_shape_op); + Value reshape_shape_op = TF::ConstOp::create( + rewriter, iota_op.getLoc(), rewriter.getI64TensorAttr(reshape_shape)); + result = TF::ReshapeOp::create(rewriter, iota_op.getLoc(), reshape_type, + result, reshape_shape_op); - Value broadcast_shape_op = rewriter.create( - iota_op.getLoc(), rewriter.getI64TensorAttr(type.getShape())); - result = rewriter.create(iota_op.getLoc(), type, - result, broadcast_shape_op); + Value broadcast_shape_op = + TF::ConstOp::create(rewriter, iota_op.getLoc(), + rewriter.getI64TensorAttr(type.getShape())); + result = TF::BroadcastToOp::create(rewriter, iota_op.getLoc(), type, + result, broadcast_shape_op); } rewriter.replaceOp(iota_op, result); @@ -2314,8 +2327,8 @@ class ConvertLoweredCumOp : public OpConversionPattern { if (right_padding != 0) return failure(); } - auto axis = rewriter.create( - rw->getLoc(), + auto axis = TF::ConstOp::create( + rewriter, rw->getLoc(), rewriter.getIntegerAttr(rewriter.getIntegerType(64), cumulative_axis)); rewriter.replaceOpWithNewOp(rw, rw.getType(0), rw.getInputs()[0], @@ -2585,7 +2598,7 @@ arith::ConstantOp ShapeToConst(PatternRewriter& rewriter, Value value) { auto attr_type = RankedTensorType::get({static_cast(shape.size())}, rewriter.getIntegerType(64)); auto attr = DenseElementsAttr::get(attr_type, shape); - return rewriter.create(value.getLoc(), attr_type, attr); + return arith::ConstantOp::create(rewriter, value.getLoc(), attr_type, attr); } bool IsSign(APInt a, APInt sign) { @@ -2841,8 +2854,8 @@ class ConvertGatherOp : public OpConversionPattern { TF::CastOp cast_op = nullptr; if (canonical_start_indices_type.getElementType().isUnsignedInteger(32)) { - cast_op = rewriter.create( - gather_op->getLoc(), + cast_op = TF::CastOp::create( + rewriter, gather_op->getLoc(), RankedTensorType::get(canonical_start_indices_type.getShape(), rewriter.getI64Type()), canonical_start_indices); @@ -2861,8 +2874,8 @@ class ConvertGatherOp : public OpConversionPattern { auto canonical_result_type = RankedTensorType::get( canonical_result_shape, result_type.getElementType()); - auto canonical_result = rewriter.create( - gather_op->getLoc(), canonical_result_type, canonical_operand, + auto canonical_result = TF::GatherNdOp::create( + rewriter, gather_op->getLoc(), canonical_result_type, canonical_operand, cast_op ? 
cast_op.getResult() : canonical_start_indices); auto offset_dims = gather_op.getDimensionNumbers().getOffsetDims(); @@ -2968,24 +2981,24 @@ class ConvertGatherOp : public OpConversionPattern { auto min_start_indices = BuildIntArrayConstOp( builder, rewriter, llvm::SmallVector({0, 0}), start_indices_type.getElementType()); - auto start_indices_max_op = rewriter.create( - gather_op.getLoc(), start_indices, min_start_indices); - auto clamped_start_indices_op = rewriter.create( - gather_op.getLoc(), start_indices_max_op, max_start_indices); + auto start_indices_max_op = TF::MaximumOp::create( + rewriter, gather_op.getLoc(), start_indices, min_start_indices); + auto clamped_start_indices_op = TF::MinimumOp::create( + rewriter, gather_op.getLoc(), start_indices_max_op, max_start_indices); int64_t batch_size = start_indices_type.getDimSize(batch_dim); auto slice_size = BuildIntArrayConstOp( builder, rewriter, slice_sizes_vector, rewriter.getI32Type()); if (batch_size == 1) { - auto squeeze_op = rewriter.create( - gather_op.getLoc(), + auto squeeze_op = TF::SqueezeOp::create( + rewriter, gather_op.getLoc(), RankedTensorType::get({rank_two}, start_indices_type.getElementType()), clamped_start_indices_op, rewriter.getI64ArrayAttr(llvm::ArrayRef({batch_dim}))); auto slice_op = - rewriter.create(gather_op.getLoc(), gather_op.getType(), - operand, squeeze_op, slice_size); + TF::SliceOp::create(rewriter, gather_op.getLoc(), gather_op.getType(), + operand, squeeze_op, slice_size); rewriter.replaceOp(gather_op, slice_op); return mlir::success(); } @@ -2999,29 +3012,29 @@ class ConvertGatherOp : public OpConversionPattern { auto two = BuildIntArrayConstOp(builder, rewriter, llvm::SmallVector({1, 2}), rewriter.getI32Type()); - auto begin = rewriter.create( - gather_op.getLoc(), + auto begin = TF::SliceOp::create( + rewriter, gather_op.getLoc(), RankedTensorType::get({1, 2}, start_indices_type.getElementType()), clamped_start_indices_op, zero, two); - auto squeeze_op = rewriter.create( - gather_op.getLoc(), + auto squeeze_op = TF::SqueezeOp::create( + rewriter, gather_op.getLoc(), RankedTensorType::get({rank_two}, start_indices_type.getElementType()), begin, rewriter.getI64ArrayAttr(llvm::ArrayRef({batch_dim}))); - auto slice_op = rewriter.create( - gather_op.getLoc(), + auto slice_op = TF::SliceOp::create( + rewriter, gather_op.getLoc(), RankedTensorType::get({1, slice_sizes_vector[1]}, operand_type.getElementType()), operand, squeeze_op, slice_size); slices.push_back(slice_op); } auto scalar_type = RankedTensorType::get({}, rewriter.getI32Type()); - auto zero_scalar = rewriter.create( - gather_op.getLoc(), + auto zero_scalar = TF::ConstOp::create( + rewriter, gather_op.getLoc(), DenseIntElementsAttr::get(scalar_type, static_cast(0))); - auto concat_op = rewriter.create( - gather_op.getLoc(), result_type, slices, zero_scalar); + auto concat_op = TF::ConcatV2Op::create(rewriter, gather_op.getLoc(), + result_type, slices, zero_scalar); rewriter.replaceOp(gather_op, concat_op); return mlir::success(); } @@ -3116,12 +3129,13 @@ class ConvertGatherOp : public OpConversionPattern { if (canonical_result_type.hasStaticShape()) { auto unflattened_result_type = RankedTensorType::get( unflattened_shape, original_result_type.getElementType()); - canonical_result = rewriter.create( - gather_op.getLoc(), unflattened_result_type, canonical_result); + canonical_result = + mhlo::ReshapeOp::create(rewriter, gather_op.getLoc(), + unflattened_result_type, canonical_result); } // Transpose back to the original result shape. 
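// ConvertGatherOp canonical path, in brief: operand and start_indices are
// transposed and flattened into a canonical layout, unsigned 32-bit indices
// are cast to i64 when necessary, TF::GatherNdOp produces the canonical
// result, and that result is reshaped (when statically shaped) and finally
// transposed back with the inverse of the canonicalization permutation so it
// matches the original mhlo.gather result type.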
- return rewriter.create( - gather_op.getLoc(), original_result_type, canonical_result, + return mhlo::TransposeOp::create( + rewriter, gather_op.getLoc(), original_result_type, canonical_result, rewriter.getI64TensorAttr( GetInversePermutationArray(permutation_to_canonical))); } @@ -3168,13 +3182,13 @@ class ConvertGatherOp : public OpConversionPattern { // Transpose the dimensions and flatten the batching dimensions. RankedTensorType transposed_type = RankedTensorType::get(transposed_shape, operand_type.getElementType()); - auto transposed_operand = rewriter.create( - gather_op.getLoc(), transposed_type, operand, + auto transposed_operand = mhlo::TransposeOp::create( + rewriter, gather_op.getLoc(), transposed_type, operand, rewriter.getI64TensorAttr(permutation)); auto flattened_type = RankedTensorType::get(flattened_shape, operand_type.getElementType()); - auto flattened_operand = rewriter.create( - gather_op.getLoc(), flattened_type, transposed_operand); + auto flattened_operand = mhlo::ReshapeOp::create( + rewriter, gather_op.getLoc(), flattened_type, transposed_operand); return flattened_operand; } @@ -3233,13 +3247,13 @@ class ConvertGatherOp : public OpConversionPattern { reshaped_shape.push_back(index_vector_size); // Transpose the dimensions and flatten the batching dimensions. - auto transposed_start_indices = rewriter.create( - gather_op.getLoc(), + auto transposed_start_indices = mhlo::TransposeOp::create( + rewriter, gather_op.getLoc(), RankedTensorType::get(transposed_shape, start_indices_type.getElementType()), start_indices, rewriter.getI64TensorAttr(permutation)); - start_indices = rewriter.create( - gather_op.getLoc(), + start_indices = mhlo::ReshapeOp::create( + rewriter, gather_op.getLoc(), RankedTensorType::get(reshaped_shape, start_indices_type.getElementType()), transposed_start_indices); @@ -3275,32 +3289,33 @@ class ConvertGatherOp : public OpConversionPattern { llvm::SmallVector offsets_shape(start_indices_shape.size(), 1); offsets_shape[non_trivial_sliced_dim] = slice_sizes[operand_dim]; start_indices_shape[non_trivial_sliced_dim] = slice_sizes[operand_dim]; - auto offsets = rewriter.create( - gather_op.getLoc(), + auto offsets = mhlo::IotaOp::create( + rewriter, gather_op.getLoc(), RankedTensorType::get(offsets_shape, start_indices_type.getElementType()), rewriter.getI64IntegerAttr(non_trivial_sliced_dim)); non_trivial_sliced_dim++; // Pad with 0s on the other operand dimensions. - Value zero = rewriter.create( - gather_op.getLoc(), rewriter.getZeroAttr(RankedTensorType::get( - {}, start_indices_type.getElementType()))); + Value zero = arith::ConstantOp::create( + rewriter, gather_op.getLoc(), + rewriter.getZeroAttr( + RankedTensorType::get({}, start_indices_type.getElementType()))); int rank = offsets_shape.size(); llvm::SmallVector padding_low(rank, 0); llvm::SmallVector padding_high(rank, 0); llvm::SmallVector padding_interior(rank, 0); padding_low.back() = i; padding_high.back() = start_indices_shape.back() - i - 1; - auto padded_offsets = rewriter.create( - gather_op.getLoc(), offsets, zero, - GetI64ElementsAttr(padding_low, &rewriter), - GetI64ElementsAttr(padding_high, &rewriter), - GetI64ElementsAttr(padding_interior, &rewriter)); + auto padded_offsets = + mhlo::PadOp::create(rewriter, gather_op.getLoc(), offsets, zero, + GetI64ElementsAttr(padding_low, &rewriter), + GetI64ElementsAttr(padding_high, &rewriter), + GetI64ElementsAttr(padding_interior, &rewriter)); // Add the padded offsets to the start indices (with broadcasting). 
- start_indices = rewriter.create(gather_op.getLoc(), - start_indices, padded_offsets); + start_indices = TF::AddOp::create(rewriter, gather_op.getLoc(), + start_indices, padded_offsets); } if (!start_indices_batching_dims.empty()) { @@ -3308,15 +3323,15 @@ class ConvertGatherOp : public OpConversionPattern { // operand. llvm::SmallVector offsets_shape = start_indices_shape; offsets_shape.back() = 1; - auto offsets = rewriter.create( - gather_op.getLoc(), + auto offsets = mhlo::IotaOp::create( + rewriter, gather_op.getLoc(), RankedTensorType::get(offsets_shape, start_indices_type.getElementType()), rewriter.getI64IntegerAttr(0)); start_indices_shape.back()++; - start_indices = rewriter.create( - gather_op.getLoc(), + start_indices = mhlo::ConcatenateOp::create( + rewriter, gather_op.getLoc(), RankedTensorType::get(start_indices_shape, start_indices_type.getElementType()), ValueRange{offsets, start_indices}, @@ -3345,8 +3360,9 @@ class ConvertWhileOp : public OpConversionPattern { // Creates a TF::WhileRegionOp to replace the mhlo::WhileOp. HLO WhileOp // currently doesn't support stateless and shape invariant, so these // parameters are set to the default values. - auto new_while = rewriter.create( - while_op.getLoc(), while_op->getResultTypes(), while_op->getOperands(), + auto new_while = TF::WhileRegionOp::create( + rewriter, while_op.getLoc(), while_op->getResultTypes(), + while_op->getOperands(), /*parallel_iterations=*/10, /*is_stateless=*/false, /*shape_invariant=*/false); new_while.getCond().takeBody(while_op.getCond()); @@ -3366,8 +3382,8 @@ class ConvertIfOp : public OpConversionPattern { mhlo::IfOp op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { // HLO IfOp currently doesn't support stateless - auto new_op = rewriter.create( - op.getLoc(), op->getResultTypes(), op.getPred(), + auto new_op = TF::IfRegionOp::create( + rewriter, op.getLoc(), op->getResultTypes(), op.getPred(), /*is_stateless=*/false, /*_then_func_name=*/nullptr, /*_else_func_name=*/nullptr); new_op.getThenBranch().takeBody(op.getTrueBranch()); @@ -3427,10 +3443,10 @@ Value ConvertPadOp(PatternRewriter& rewriter, Operation* old_op) { {pad_op.getEdgePaddingLow().size(), 2}, rewriter.getI64Type()); auto padding_attr = DenseIntElementsAttr::get(padding_attr_type, padding); auto padding_amount_const_op = - rewriter.create(loc, padding_attr_type, padding_attr); - auto new_pad_op = rewriter.create( - loc, pad_op.getType().clone(pad_output_shape), pad_op.getOperand(), - padding_amount_const_op, pad_op.getPaddingValue()); + arith::ConstantOp::create(rewriter, loc, padding_attr_type, padding_attr); + auto new_pad_op = TF::PadV2Op::create( + rewriter, loc, pad_op.getType().clone(pad_output_shape), + pad_op.getOperand(), padding_amount_const_op, pad_op.getPaddingValue()); if (!has_negative_padding_amount) { return new_pad_op; } @@ -3438,15 +3454,14 @@ Value ConvertPadOp(PatternRewriter& rewriter, Operation* old_op) { // Convert negative padding amount into slice. 
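// ConvertPadOp, sketched: the non-negative part of the edge padding becomes a
// TF::PadV2Op fed by an arith::ConstantOp of shape [rank, 2]; if any edge
// padding amount is negative, the padded result is additionally narrowed by a
// TF::SliceOp whose begin/size constants encode those negative amounts, so
// the pair reproduces mhlo.pad semantics.
//
//   return TF::SliceOp::create(rewriter, loc, pad_op.getType(), new_pad_op,
//                              slice_begins_const_op, slice_sizes_const_op);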
auto slice_attr_type = RankedTensorType::get( {pad_op.getEdgePaddingLow().size()}, rewriter.getI64Type()); - auto slice_begins_const_op = rewriter.create( - loc, slice_attr_type, + auto slice_begins_const_op = arith::ConstantOp::create( + rewriter, loc, slice_attr_type, DenseIntElementsAttr::get(slice_attr_type, slice_begins)); - auto slice_sizes_const_op = rewriter.create( - loc, slice_attr_type, + auto slice_sizes_const_op = arith::ConstantOp::create( + rewriter, loc, slice_attr_type, DenseIntElementsAttr::get(slice_attr_type, slice_sizes)); - return rewriter.create(loc, pad_op.getType(), new_pad_op, - slice_begins_const_op, - slice_sizes_const_op); + return TF::SliceOp::create(rewriter, loc, pad_op.getType(), new_pad_op, + slice_begins_const_op, slice_sizes_const_op); } class ConvertPopulationCountOp @@ -3459,8 +3474,8 @@ class ConvertPopulationCountOp ConversionPatternRewriter& rewriter) const final { auto output_type = op.getType().clone( rewriter.getIntegerType(/*width=*/8, /*isSigned=*/false)); - auto pop_cnt = rewriter.create( - op.getLoc(), output_type, op.getOperand()); + auto pop_cnt = TF::PopulationCountOp::create(rewriter, op.getLoc(), + output_type, op.getOperand()); auto cast_or_pop_cnt = rewriter.createOrFold(op.getLoc(), op.getType(), pop_cnt); rewriter.replaceOp(op, {cast_or_pop_cnt}); @@ -3608,9 +3623,9 @@ class ConvertCustomCallWithApproxTopK } auto is_max_k = rewriter.getBoolAttr(true); - auto approx_top_k = rewriter.create( - op.getLoc(), op->getResultTypes(), op.getInputs()[0], top_k_attr, - reduction_dim_attr, recall_target_attr, is_max_k, + auto approx_top_k = TF::ApproxTopKOp::create( + rewriter, op.getLoc(), op->getResultTypes(), op.getInputs()[0], + top_k_attr, reduction_dim_attr, recall_target_attr, is_max_k, reduction_input_size_override_attr, aggregate_to_topk_attr); rewriter.replaceOp(op, approx_top_k.getResults()); @@ -3661,8 +3676,8 @@ class ConvertGetDimensionSizeOp mhlo::GetDimensionSizeOp op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { ImplicitLocOpBuilder builder(op.getLoc(), rewriter); - Value shape_op = rewriter.create(op.getLoc(), op.getOperand(), - rewriter.getBoolAttr(true)); + Value shape_op = TF::ShapeOp::create(rewriter, op.getLoc(), op.getOperand(), + rewriter.getBoolAttr(true)); Value size = BuildIntArrayConstOp(builder, rewriter, llvm::SmallVector({1}), rewriter.getI32Type()); @@ -3670,13 +3685,13 @@ class ConvertGetDimensionSizeOp builder, rewriter, llvm::SmallVector({static_cast(op.getDimension())}), rewriter.getI64Type()); - Value slice_op = rewriter.create( - op.getLoc(), + Value slice_op = TF::SliceOp::create( + rewriter, op.getLoc(), RankedTensorType::get({static_cast(1)}, op.getType().getElementType()), shape_op, begin, size); - Value squeeze_op = rewriter.create( - op.getLoc(), op.getType(), slice_op, + Value squeeze_op = TF::SqueezeOp::create( + rewriter, op.getLoc(), op.getType(), slice_op, rewriter.getI64ArrayAttr(llvm::ArrayRef({0}))); rewriter.replaceOp(op, {squeeze_op}); return success(); @@ -3749,25 +3764,26 @@ class ConvertDynamicIotaOp : public OpConversionPattern { if (mlir::isa(element_type)) { auto cast_type = mlir::cast(output_shape.getType()).clone(element_type); - output_shape = rewriter.create(dynamic_iota_op.getLoc(), - cast_type, output_shape); + output_shape = TF::CastOp::create(rewriter, dynamic_iota_op.getLoc(), + cast_type, output_shape); } DenseIntElementsAttr scalar_attr = DenseIntElementsAttr::get( RankedTensorType::get({0}, rewriter.getI32Type()), llvm::ArrayRef({})); auto 
scalar_shape = - rewriter.create(dynamic_iota_op.getLoc(), scalar_attr); - auto limit_scalar = rewriter.create( - dynamic_iota_op.getLoc(), RankedTensorType::get({}, element_type), - output_shape, scalar_shape); + TF::ConstOp::create(rewriter, dynamic_iota_op.getLoc(), scalar_attr); + auto limit_scalar = TF::ReshapeOp::create( + rewriter, dynamic_iota_op.getLoc(), + RankedTensorType::get({}, element_type), output_shape, scalar_shape); auto range_type = RankedTensorType::get({type.getShape()[dimension]}, element_type); Value start_op = - rewriter.create(dynamic_iota_op.getLoc(), start); + TF::ConstOp::create(rewriter, dynamic_iota_op.getLoc(), start); Value delta_op = - rewriter.create(dynamic_iota_op.getLoc(), delta); - Value range_op = rewriter.create( - dynamic_iota_op.getLoc(), range_type, start_op, limit_scalar, delta_op); + TF::ConstOp::create(rewriter, dynamic_iota_op.getLoc(), delta); + Value range_op = + TF::RangeOp::create(rewriter, dynamic_iota_op.getLoc(), range_type, + start_op, limit_scalar, delta_op); rewriter.replaceOp(dynamic_iota_op, range_op); return success(); } @@ -3820,7 +3836,7 @@ arith::ConstantOp ExpandedShape(PatternRewriter& rewriter, Value input, RankedTensorType::get({static_cast(expanded_shape.size())}, rewriter.getIntegerType(64)); auto attr = DenseElementsAttr::get(attr_type, expanded_shape); - return rewriter.create(output.getLoc(), attr_type, attr); + return arith::ConstantOp::create(rewriter, output.getLoc(), attr_type, attr); } Value ExpandedDynamicShape(PatternRewriter& rewriter, Value input, @@ -3843,9 +3859,9 @@ Value ExpandedDynamicShape(PatternRewriter& rewriter, Value input, for (int64_t i : expanded_dimensions) { auto index_attr = DenseIntElementsAttr::get( RankedTensorType::get({}, rewriter.getI64Type()), {i}); - Value index = rewriter.create(output.getLoc(), index_attr); - expanded_input = rewriter.create(output.getLoc(), - expanded_input, index); + Value index = TF::ConstOp::create(rewriter, output.getLoc(), index_attr); + expanded_input = TF::ExpandDimsOp::create(rewriter, output.getLoc(), + expanded_input, index); } return expanded_input; } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.cc index e5ea3d2ebc5e93..096de88c16055f 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.cc @@ -111,11 +111,11 @@ Value CreatePadOpFromConvPadding(OpBuilder& b, mhlo::ConvolutionOp op) { auto padding_value_type = RankedTensorType::get({}, data.ElementType()); auto padding_value_attr = b.getZeroAttr(padding_value_type); auto padding_value_op = - b.create(op->getLoc(), padding_value_attr); + arith::ConstantOp::create(b, op->getLoc(), padding_value_attr); - auto pad_op = b.create(padding_value_op->getLoc(), op.getLhs(), - padding_value_op, lo_padding_attr, - hi_padding_attr, interior_padding_attr); + auto pad_op = mhlo::PadOp::create(b, padding_value_op->getLoc(), op.getLhs(), + padding_value_op, lo_padding_attr, + hi_padding_attr, interior_padding_attr); return pad_op; } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc index f89f8acd446315..18d9b10d677259 100644 --- 
a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc @@ -82,9 +82,9 @@ LogicalResult ConvertCustomCallOp::matchAndRewrite( if (!call_target_name.starts_with("custom_call.")) { return failure(); } - auto tfl_custom = rewriter.create( - mhlo_custom_call.getLoc(), mhlo_custom_call.getResultTypes(), - mhlo_custom_call.getInputs()); + auto tfl_custom = TFL::CustomOp::create(rewriter, mhlo_custom_call.getLoc(), + mhlo_custom_call.getResultTypes(), + mhlo_custom_call.getInputs()); tfl_custom.setCustomCodeAttr(rewriter.getStringAttr(call_target_name)); if (auto bc = mhlo_custom_call.getBackendConfig()) { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc index 940c75256b9e75..347817d3cc6d59 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc @@ -178,7 +178,8 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, ImplicitLocOpBuilder& builder, bool is_lhs) { auto operand_type = mlir::cast(operand.getType()); - auto operand_shape = builder.create( + auto operand_shape = TFL::ShapeOp::create( + builder, RankedTensorType::get(static_cast(operand_type.getRank()), builder.getIntegerType(32)), operand); @@ -197,27 +198,29 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, } auto seg_prod_result_type = RankedTensorType::get(static_cast(1), builder.getI32Type()); - auto out_segids_cst = builder.create( - builder.getI32TensorAttr(flattened_out_segids)); - auto contracting_segids_cst = builder.create( - builder.getI32TensorAttr(flattened_contracting_segids)); - auto num_segids_tensor = - builder.create(DenseIntElementsAttr::get( - RankedTensorType::get({}, builder.getIntegerType(32)), 1)); - auto flattened_out_dims = builder.create( - seg_prod_result_type, operand_shape, out_segids_cst, num_segids_tensor); - auto flattened_contracting_dims = builder.create( - seg_prod_result_type, operand_shape, contracting_segids_cst, + auto out_segids_cst = TFL::ConstOp::create( + builder, builder.getI32TensorAttr(flattened_out_segids)); + auto contracting_segids_cst = TFL::ConstOp::create( + builder, builder.getI32TensorAttr(flattened_contracting_segids)); + auto num_segids_tensor = TFL::ConstOp::create( + builder, DenseIntElementsAttr::get( + RankedTensorType::get({}, builder.getIntegerType(32)), 1)); + auto flattened_out_dims = TFL::UnsortedSegmentProdOp::create( + builder, seg_prod_result_type, operand_shape, out_segids_cst, + num_segids_tensor); + auto flattened_contracting_dims = TFL::UnsortedSegmentProdOp::create( + builder, seg_prod_result_type, operand_shape, contracting_segids_cst, num_segids_tensor); llvm::SmallVector flattend_shape_values; // Gather the batch dimensions. 
if (!dot_dimensions_info.batch_dimensions().AxesArray().empty()) { if (ShapedType::isDynamicShape( dot_dimensions_info.batch_dimensions().SizesArray())) { - auto batch_axes_tensor = - builder.create(builder.getI64TensorAttr( - dot_dimensions_info.batch_dimensions().AxesArray())); - auto batch_dims = builder.create( + auto batch_axes_tensor = TFL::ConstOp::create( + builder, builder.getI64TensorAttr( + dot_dimensions_info.batch_dimensions().AxesArray())); + auto batch_dims = TFL::GatherOp::create( + builder, RankedTensorType::get( {static_cast( dot_dimensions_info.batch_dimensions().AxesArray().size())}, @@ -230,8 +233,8 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, dot_dimensions_info.batch_dimensions().SizesArray()) { batch_i32_vec.push_back(static_cast(element)); } - auto batch_dims = - builder.create(builder.getI32TensorAttr(batch_i32_vec)); + auto batch_dims = TFL::ConstOp::create( + builder, builder.getI32TensorAttr(batch_i32_vec)); flattend_shape_values.push_back(batch_dims); } } @@ -247,9 +250,9 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, builder.getIntegerType(32)); // Concatenate the batch dimensions, flattened out dimension and flattened // contracting dimension. - return builder.create( - concat_result_type, flattend_shape_values, /*axis*/ 0, - /*fused_activation_function*/ "NONE"); + return TFL::ConcatenationOp::create(builder, concat_result_type, + flattend_shape_values, /*axis*/ 0, + /*fused_activation_function*/ "NONE"); } } // namespace @@ -280,8 +283,8 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, lhs_dot_dimensions_info.batch_dimensions().SizesArray(), lhs_dot_dimensions_info.out_dimensions().SizesArray(), lhs_dot_dimensions_info.contracting_dimensions().SizesArray()); - auto lhs_transposed = rewriter.create( - loc, + auto lhs_transposed = mhlo::TransposeOp::create( + rewriter, loc, RankedTensorType::get(lhs_transposed_shape, lhs_type.getElementType()), lhs, DenseIntElementsAttr::get( @@ -298,8 +301,8 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, rhs_dot_dimensions_info.batch_dimensions().SizesArray(), rhs_dot_dimensions_info.contracting_dimensions().SizesArray(), rhs_dot_dimensions_info.out_dimensions().SizesArray()); - auto rhs_transposed = rewriter.create( - loc, + auto rhs_transposed = mhlo::TransposeOp::create( + rewriter, loc, RankedTensorType::get(rhs_transposed_shape, rhs_type.getElementType()), rhs, DenseIntElementsAttr::get( @@ -314,15 +317,15 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, lhs_dot_dimensions_info.FlattenedContractingDimensionSize()}); Value lhs_flattend; if (lhs_type.hasStaticShape()) { - lhs_flattend = rewriter.create( - loc, + lhs_flattend = mhlo::ReshapeOp::create( + rewriter, loc, RankedTensorType::get(lhs_flattened_shape, lhs_type.getElementType()), lhs_transposed.getResult()); } else { auto lhs_flattend_shape_op = BuildDotOperandFlattenedShapeOp( lhs, lhs_dot_dimensions_info, builder, /*is_lhs=*/true); - lhs_flattend = rewriter.create( - loc, + lhs_flattend = mhlo::DynamicReshapeOp::create( + rewriter, loc, RankedTensorType::get(lhs_flattened_shape, lhs_type.getElementType()), lhs_transposed, lhs_flattend_shape_op); } @@ -336,15 +339,15 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, rhs_dot_dimensions_info.FlattenedOutDimensionSize()}); Value rhs_flattend; if (rhs_type.hasStaticShape()) { - rhs_flattend = rewriter.create( - loc, + rhs_flattend = mhlo::ReshapeOp::create( + rewriter, loc, RankedTensorType::get(rhs_flattened_shape, 
rhs_type.getElementType()), rhs_transposed.getResult()); } else { auto rhs_flattend_shape_op = BuildDotOperandFlattenedShapeOp( rhs, rhs_dot_dimensions_info, builder, /*is_lhs=*/false); - rhs_flattend = rewriter.create( - loc, + rhs_flattend = mhlo::DynamicReshapeOp::create( + rewriter, loc, RankedTensorType::get(rhs_flattened_shape, rhs_type.getElementType()), rhs_transposed, rhs_flattend_shape_op); } @@ -357,44 +360,46 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, llvm::ArrayRef{ rhs_dot_dimensions_info.FlattenedOutDimensionSize()}); BoolAttr false_attr = rewriter.getBoolAttr(false); - auto matmul = rewriter.create( - loc, RankedTensorType::get(matmul_shape, result_type.getElementType()), + auto matmul = TFL::BatchMatMulOp::create( + rewriter, loc, + RankedTensorType::get(matmul_shape, result_type.getElementType()), lhs_flattend, rhs_flattend, /*adj_x*/ false_attr, /*adj_y*/ false_attr, /*asym_quant_input*/ false_attr); if (result_type.hasStaticShape()) { auto reshaped = - rewriter.create(loc, result_type, matmul.getResult()); + mhlo::ReshapeOp::create(rewriter, loc, result_type, matmul.getResult()); return reshaped.getResult(); } // Reshape for dynamic shaped operands. The result shape is // [lhs_batch_dimensions, lhs_out_dimensions, rhs_out_dimensions]. - auto lhs_shape = rewriter.create( - loc, + auto lhs_shape = TFL::ShapeOp::create( + rewriter, loc, RankedTensorType::get(static_cast(lhs_type.getRank()), builder.getIntegerType(32)), lhs); - auto rhs_shape = rewriter.create( - loc, + auto rhs_shape = TFL::ShapeOp::create( + rewriter, loc, RankedTensorType::get(static_cast(rhs_type.getRank()), builder.getIntegerType(32)), rhs); llvm::SmallVector lhs_batch_and_out = Concat(lhs_dot_dimensions_info.batch_dimensions().AxesArray(), lhs_dot_dimensions_info.out_dimensions().AxesArray()); - auto lhs_batch_and_out_cst = rewriter.create( - loc, rewriter.getI64TensorAttr(lhs_batch_and_out)); - auto lhs_batch_and_out_dims = rewriter.create( - loc, + auto lhs_batch_and_out_cst = TFL::ConstOp::create( + rewriter, loc, rewriter.getI64TensorAttr(lhs_batch_and_out)); + auto lhs_batch_and_out_dims = TFL::GatherOp::create( + rewriter, loc, RankedTensorType::get({static_cast(lhs_batch_and_out.size())}, rewriter.getIntegerType(32)), lhs_shape, lhs_batch_and_out_cst, /*axis*/ 0, /*batch_dims*/ 0); - auto rhs_out_cst = rewriter.create( - loc, rewriter.getI64TensorAttr( - rhs_dot_dimensions_info.out_dimensions().AxesArray())); - auto rhs_out_dims = rewriter.create( - loc, + auto rhs_out_cst = TFL::ConstOp::create( + rewriter, loc, + rewriter.getI64TensorAttr( + rhs_dot_dimensions_info.out_dimensions().AxesArray())); + auto rhs_out_dims = TFL::GatherOp::create( + rewriter, loc, RankedTensorType::get( {static_cast( rhs_dot_dimensions_info.out_dimensions().AxesArray().size())}, @@ -407,12 +412,12 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, lhs_dot_dimensions_info.out_dimensions().AxesArray().size() + rhs_dot_dimensions_info.out_dimensions().AxesArray().size())}, rewriter.getIntegerType(32)); - auto result_shape = rewriter.create( - loc, result_shape_type, ValueRange{lhs_batch_and_out_dims, rhs_out_dims}, - 0, "NONE"); + auto result_shape = TFL::ConcatenationOp::create( + rewriter, loc, result_shape_type, + ValueRange{lhs_batch_and_out_dims, rhs_out_dims}, 0, "NONE"); - auto reshaped = rewriter.create( - loc, result_type, matmul.getResult(), result_shape); + auto reshaped = mhlo::DynamicReshapeOp::create( + rewriter, loc, result_type, matmul.getResult(), 
result_shape); return reshaped.getResult(); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc index f2d29774c31c89..34b1b60fd1b825 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc @@ -74,9 +74,11 @@ bool IsSupportedRfftOp(mhlo::FftOp fft_op) { // concatenate with other dimension sizes. Value GetDimensionSizeTensor(OpBuilder& rewriter, Location loc, Value input, int64_t dim) { - auto size_scalar = rewriter.create(loc, input, dim); - return rewriter.create( - loc, RankedTensorType::get({1}, rewriter.getI32Type()), size_scalar); + auto size_scalar = + mhlo::GetDimensionSizeOp::create(rewriter, loc, input, dim); + return mhlo::ReshapeOp::create( + rewriter, loc, RankedTensorType::get({1}, rewriter.getI32Type()), + size_scalar); } // Convert rfft to rfft2d. @@ -154,13 +156,13 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern { expanded_input_shape_values.push_back(GetDimensionSizeTensor( rewriter, fft_op.getLoc(), fft_operand, i)); } - expanded_input_shape_values.push_back(rewriter.create( - fft_op.getLoc(), rewriter.getI32TensorAttr({1}))); + expanded_input_shape_values.push_back(mhlo::ConstantOp::create( + rewriter, fft_op.getLoc(), rewriter.getI32TensorAttr({1}))); expanded_input_shape_values.push_back(GetDimensionSizeTensor( rewriter, fft_op.getLoc(), fft_operand, input_shape.size() - 1)); - auto expanded_input_shape_tensor = rewriter.create( - fft_op.getLoc(), + auto expanded_input_shape_tensor = mhlo::ConcatenateOp::create( + rewriter, fft_op.getLoc(), RankedTensorType::get( {static_cast(expanded_input_shape_values.size())}, rewriter.getI32Type()), @@ -168,12 +170,12 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern { // Create a new mhlo.dynamic_reshape op with the expanded input and // expanded input shape. SHAPE tensor is created in the previous step. - fft_operand = rewriter.create( - fft_op.getLoc(), expanded_input_type, fft_operand, + fft_operand = mhlo::DynamicReshapeOp::create( + rewriter, fft_op.getLoc(), expanded_input_type, fft_operand, expanded_input_shape_tensor); } else { - fft_operand = rewriter.create( - fft_op.getLoc(), expanded_input_type, fft_operand); + fft_operand = mhlo::ReshapeOp::create(rewriter, fft_op.getLoc(), + expanded_input_type, fft_operand); } SmallVector new_output_shape = {output_shape.begin(), @@ -186,8 +188,8 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern { } auto new_fft = - rewriter.create(fft_op.getLoc(), output_type, fft_operand, - fft_op.getFftType(), new_fft_lengths_attr); + mhlo::FftOp::create(rewriter, fft_op.getLoc(), output_type, fft_operand, + fft_op.getFftType(), new_fft_lengths_attr); if (input_shape[input_shape.size() - 2] != 1) { // Squeeze the output dimensions back to 2D. 
@@ -202,19 +204,20 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern { rewriter, fft_op.getLoc(), new_fft.getResult(), new_fft.getResult().getType().getShape().size() - 1)); - auto shape_tensor = rewriter.create( - fft_op.getLoc(), + auto shape_tensor = mhlo::ConcatenateOp::create( + rewriter, fft_op.getLoc(), RankedTensorType::get( {static_cast(output_shape_values.size())}, rewriter.getI32Type()), output_shape_values, 0); - auto squeeze_op = rewriter.create( - fft_op.getLoc(), fft_op.getResult().getType(), new_fft.getResult(), - shape_tensor); + auto squeeze_op = mhlo::DynamicReshapeOp::create( + rewriter, fft_op.getLoc(), fft_op.getResult().getType(), + new_fft.getResult(), shape_tensor); rewriter.replaceOp(fft_op, squeeze_op.getResult()); } else { - auto squeeze_op = rewriter.create( - fft_op.getLoc(), fft_op.getResult().getType(), new_fft.getResult()); + auto squeeze_op = mhlo::ReshapeOp::create(rewriter, fft_op.getLoc(), + fft_op.getResult().getType(), + new_fft.getResult()); rewriter.replaceOp(fft_op, squeeze_op.getResult()); } } else { @@ -256,9 +259,10 @@ class LegalizeRfftOp : public OpConversionPattern { auto output_type = mlir::cast(fft_op.getResult().getType()); auto fft_len_const = - rewriter.create(fft_op.getLoc(), fft_len_f32_attr); - auto tfl_rfft2d = rewriter.create( - fft_op.getLoc(), output_type, fft_op.getOperand(), fft_len_const); + arith::ConstantOp::create(rewriter, fft_op.getLoc(), fft_len_f32_attr); + auto tfl_rfft2d = + TFL::RFFT2dOp::create(rewriter, fft_op.getLoc(), output_type, + fft_op.getOperand(), fft_len_const); rewriter.replaceOp(fft_op, tfl_rfft2d.getResult()); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.cc index 539a9934f75e5a..9833b3415f3059 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.cc @@ -119,9 +119,9 @@ LogicalResult LowerGELU::matchAndRewrite(Operation* op, if (!HasSplatArg(rhs_mul, kOneOverRoot2, 1)) return failure(); auto is_approx_attr = rewriter.getBoolAttr(false); - auto gelu = rewriter.create( - output_mul.getLoc(), output_mul.getResult().getType(), - erf_input->getOperand(0), is_approx_attr); + auto gelu = TFL::GeluOp::create(rewriter, output_mul.getLoc(), + output_mul.getResult().getType(), + erf_input->getOperand(0), is_approx_attr); rewriter.replaceAllOpUsesWith(output_mul, gelu); // Note these must be erased in reverse topo order to avoid // failing in debug mode. 
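Note: every hunk in these conversion files applies the same mechanical change, replacing the member-template builder rewriter.create<OpTy>(loc, args...) with the static OpTy::create(rewriter, loc, args...) form. A minimal sketch of the two spellings, modeled on the arith::ConstantOp calls above (illustrative only, not part of the patch; the BuildI64VectorConst helper is hypothetical):

#include "llvm/ADT/ArrayRef.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"

// Builds a rank-1 i64 tensor constant; the two return statements are
// equivalent, and the patch standardizes on the second spelling.
static mlir::Value BuildI64VectorConst(mlir::PatternRewriter& rewriter,
                                       mlir::Location loc,
                                       llvm::ArrayRef<int64_t> values) {
  auto type = mlir::RankedTensorType::get(
      {static_cast<int64_t>(values.size())}, rewriter.getI64Type());
  auto attr = mlir::DenseIntElementsAttr::get(type, values);
  // Old spelling, removed throughout this patch:
  //   return rewriter.create<mlir::arith::ConstantOp>(loc, type, attr);
  // New spelling, added throughout this patch:
  return mlir::arith::ConstantOp::create(rewriter, loc, type, attr);
}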
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.cc index e43f342aec2cdc..6b377c0eee933c 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.cc @@ -33,8 +33,8 @@ class LegalizeIfOp : public OpConversionPattern { LogicalResult matchAndRewrite( mhlo::IfOp if_op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { - auto new_op = rewriter.create( - if_op.getLoc(), if_op.getResultTypes(), if_op.getPred()); + auto new_op = TFL::IfOp::create(rewriter, if_op.getLoc(), + if_op.getResultTypes(), if_op.getPred()); new_op.getThenRegion().takeBody(if_op.getTrueBranch()); new_op.getElseRegion().takeBody(if_op.getFalseBranch()); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.cc index f237a7168e5660..5b5368ac1f5522 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.cc @@ -248,8 +248,8 @@ LogicalResult ConvertReduceOpToArgMinMax< int64_t axis = reduce_op.getDimensions().getValues()[0]; auto dim_type = RankedTensorType::get({1}, rewriter.getI32Type()); - auto reduction_indices = rewriter.create( - reduce_op.getLoc(), dim_type, + auto reduction_indices = arith::ConstantOp::create( + rewriter, reduce_op.getLoc(), dim_type, rewriter.getI32TensorAttr({static_cast(axis)})); // Generate a Max and an ArgMax of as the mhlo op returns both while in TF @@ -260,24 +260,24 @@ LogicalResult ConvertReduceOpToArgMinMax< if (operand_type.getElementType().isInteger(1)) { // TF does not support min or max on boolean (int1) arguments. // Use AnyOp for MaxOp and AllOp for MinOp. 
- auto tf_reduce_op = rewriter.create( - reduce_op.getLoc(), reduce_op->getResult(0).getType(), operand, - reduction_indices, + auto tf_reduce_op = BooleanReduce::create( + rewriter, reduce_op.getLoc(), reduce_op->getResult(0).getType(), + operand, reduction_indices, /*keep_dim=*/rewriter.getBoolAttr(false)); - auto tf_argreduce_op = rewriter.create( - reduce_op.getLoc(), reduce_op->getResult(1).getType(), operand, - reduction_indices); + auto tf_argreduce_op = ArgReduce::create(rewriter, reduce_op.getLoc(), + reduce_op->getResult(1).getType(), + operand, reduction_indices); rewriter.replaceOp(reduce_op, {tf_reduce_op, tf_argreduce_op}); } else { - auto tf_reduce_op = rewriter.create( - reduce_op.getLoc(), reduce_op->getResult(0).getType(), operand, - reduction_indices, + auto tf_reduce_op = Reduce::create( + rewriter, reduce_op.getLoc(), reduce_op->getResult(0).getType(), + operand, reduction_indices, /*keep_dim=*/rewriter.getBoolAttr(false)); - auto tf_argreduce_op = rewriter.create( - reduce_op.getLoc(), reduce_op->getResult(1).getType(), operand, - reduction_indices); + auto tf_argreduce_op = ArgReduce::create(rewriter, reduce_op.getLoc(), + reduce_op->getResult(1).getType(), + operand, reduction_indices); rewriter.replaceOp(reduce_op, {tf_reduce_op, tf_argreduce_op}); } @@ -366,9 +366,10 @@ template LogicalResult rewriteNonMatchInitValue(mhlo::ReduceOp reduce_op, Value input, arith::ConstantOp reduction_indices, ConversionPatternRewriter& rewriter) { - Value reduce_result = rewriter.create( - reduce_op.getLoc(), reduce_op.getType(0), input, reduction_indices, - /*keep_dim=*/rewriter.getBoolAttr(false)); + Value reduce_result = + ReduceOp::create(rewriter, reduce_op.getLoc(), reduce_op.getType(0), + input, reduction_indices, + /*keep_dim=*/rewriter.getBoolAttr(false)); if constexpr (BuilderHasFAF) { rewriter.replaceOpWithNewOp(reduce_op, reduce_result, @@ -455,7 +456,7 @@ class ConvertReduce : public OpConversionPattern { auto tfl_dims = GetDimsAsI32Elements(rewriter, reduce_op); auto tfl_dims_op = - rewriter.create(reduce_op.getLoc(), tfl_dims); + arith::ConstantOp::create(rewriter, reduce_op.getLoc(), tfl_dims); // // replace with new reduce op, chaining binary op if needed. 
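The OpConversionPattern skeletons in these files are otherwise unchanged; only the builder calls inside matchAndRewrite move to the static form. A schematic pattern in the new style, with hypothetical SourceOp/TargetOp placeholders standing in for the concrete mhlo-to-TFL pairs handled here (sketch only, assuming the generic ODS builder that takes result types and operands):

#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/DialectConversion.h"

// Schematic only: shows where OpTy::create(rewriter, ...) replaces
// rewriter.create<OpTy>(...) inside a conversion pattern.
template <typename SourceOp, typename TargetOp>
class LegalizeWithStaticCreate : public mlir::OpConversionPattern<SourceOp> {
 public:
  using mlir::OpConversionPattern<SourceOp>::OpConversionPattern;
  using OpAdaptor = typename SourceOp::Adaptor;

  mlir::LogicalResult matchAndRewrite(
      SourceOp op, OpAdaptor adaptor,
      mlir::ConversionPatternRewriter& rewriter) const final {
    // Previously: rewriter.create<TargetOp>(op.getLoc(), ...).
    auto new_op = TargetOp::create(rewriter, op.getLoc(), op->getResultTypes(),
                                   adaptor.getOperands());
    rewriter.replaceOp(op, new_op->getResults());
    return mlir::success();
  }
};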
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.cc index 4382a9864cac02..c4a3dc62fd58f0 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.cc @@ -126,7 +126,7 @@ Value TransposeTensor(OpBuilder& b, Value tensor, const int64_t perm_size = perm.size(); auto perm_attr_type = RankedTensorType::get({perm_size}, b.getI64Type()); auto perm_attr = DenseIntElementsAttr::get(perm_attr_type, perm); - return b.create(tensor.getLoc(), tensor, perm_attr); + return mhlo::TransposeOp::create(b, tensor.getLoc(), tensor, perm_attr); } DenseIntElementsAttr BuildDenseI64(OpBuilder& b, ArrayRef shape, @@ -289,9 +289,10 @@ LogicalResult RelayoutReduceWindow::matchAndRewrite( // transpose input and build new reduce_window auto new_input = TransposeTensor(rewriter, input, perm_for_inputs); - auto new_rw = rewriter.create( - op.getLoc(), new_out_type, new_input, init_val, new_window_dims_attr, - new_window_strides_attr, BuildDenseI64(rewriter, view.BaseDilations()), + auto new_rw = mhlo::ReduceWindowOp::create( + rewriter, op.getLoc(), new_out_type, new_input, init_val, + new_window_dims_attr, new_window_strides_attr, + BuildDenseI64(rewriter, view.BaseDilations()), BuildDenseI64(rewriter, view.WindowDilations()), new_paddings_attr); IRMapping ir_map; op.getBody().cloneInto(&new_rw.getBody(), ir_map); @@ -412,7 +413,7 @@ LogicalResult LegalizeCumSum::matchAndRewrite( RankedTensorType::get({}, rewriter.getI32Type()), static_cast(axis)); auto axis_cst = - rewriter.create(op->getLoc(), axis_cst_attr); + arith::ConstantOp::create(rewriter, op->getLoc(), axis_cst_attr); auto tfl_exclusive_attr = rewriter.getBoolAttr(false); auto tfl_reverse_attr = rewriter.getBoolAttr(false); @@ -476,7 +477,7 @@ TFL::PadV2Op LegalizeMaxPool::BuildExplicitPadOp( llvm::ArrayRef(padding_values)); auto padding_values_op = - rewriter.create(op.getLoc(), padding_dense_attr); + arith::ConstantOp::create(rewriter, op.getLoc(), padding_dense_attr); llvm::SmallVector pad_output_shape_vector; pad_output_shape_vector.push_back(input_type.getDimSize(0)); @@ -489,8 +490,8 @@ TFL::PadV2Op LegalizeMaxPool::BuildExplicitPadOp( pad_output_shape_vector.push_back(input_type.getDimSize(3)); auto pad_output_type = mlir::RankedTensorType::get( pad_output_shape_vector, output_type.getElementType()); - return rewriter.create(op.getLoc(), pad_output_type, input, - padding_values_op, init); + return TFL::PadV2Op::create(rewriter, op.getLoc(), pad_output_type, input, + padding_values_op, init); } LogicalResult LegalizeMaxPool::matchAndRewrite( @@ -575,13 +576,12 @@ void ReplaceWithAvgPool(mhlo::DivOp op, Value rw_lhs_input, auto [fh, fw, sh, sw, p, faf] = BuildTFLPoolAttrs(rewriter, lhs_view, padding); - Value final_op = rewriter.create( - op->getLoc(), out_type, rw_lhs_input, fh, fw, p, sh, sw, faf); + Value final_op = TFL::AveragePool2DOp::create( + rewriter, op->getLoc(), out_type, rw_lhs_input, fh, fw, p, sh, sw, faf); if (opt_final_tpose) { - final_op = rewriter - .create(final_op.getLoc(), final_op, - opt_final_tpose.getPermutation()) + final_op = mhlo::TransposeOp::create(rewriter, final_op.getLoc(), final_op, + opt_final_tpose.getPermutation()) .getResult(); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.cc 
b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.cc index 87bf7770a20ddf..303c446d536b47 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.cc @@ -69,8 +69,8 @@ LogicalResult CanonicalizeScatterUpdates( auto permutation_and_shape = GetPermutationAndTransposedShape( permutation_array, updates_type, rewriter); - auto transposed_updates = rewriter.create( - scatter_op->getLoc(), permutation_and_shape.shape, updates, + auto transposed_updates = mhlo::TransposeOp::create( + rewriter, scatter_op->getLoc(), permutation_and_shape.shape, updates, permutation_and_shape.permutation); updates = transposed_updates; @@ -163,9 +163,9 @@ LogicalResult ConvertScatterOp::matchAndRewrite( permutation_array, operand_type, rewriter); Location loc = scatter_op.getLoc(); - auto transposed_operand = rewriter.create( - loc, permutation_and_shape.shape, operands[0], - permutation_and_shape.permutation); + auto transposed_operand = + mhlo::TransposeOp::create(rewriter, loc, permutation_and_shape.shape, + operands[0], permutation_and_shape.permutation); Value new_indices = indices; int64_t index_depth = @@ -181,8 +181,8 @@ LogicalResult ConvertScatterOp::matchAndRewrite( builder, rewriter, llvm::SmallVector({num_updates, index_depth}), rewriter.getI32Type()); - new_indices = rewriter.create( - loc, + new_indices = TF::ReshapeOp::create( + rewriter, loc, RankedTensorType::get({num_updates, index_depth}, indices_type.getElementType()), indices, indices_shape); @@ -190,8 +190,8 @@ LogicalResult ConvertScatterOp::matchAndRewrite( builder, rewriter, llvm::SmallVector({num_updates, updates_type.getDimSize(0)}), rewriter.getI32Type()); - new_updates = rewriter.create( - loc, + new_updates = TF::ReshapeOp::create( + rewriter, loc, RankedTensorType::get({1, updates_type.getDimSize(0)}, updates_type.getElementType()), new_updates, updates_shape); @@ -200,8 +200,8 @@ LogicalResult ConvertScatterOp::matchAndRewrite( // Apply TF scatter to update the trailing dimensions of the // transposed operand. auto tf_scatter_op = - rewriter.create(loc, permutation_and_shape.shape, - transposed_operand, new_indices, new_updates); + TfOp::create(rewriter, loc, permutation_and_shape.shape, + transposed_operand, new_indices, new_updates); // Reverse the earlier transpose. auto inverse_permutation = GetInversePermutation(permutation_array, rewriter); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.cc index e43c0c665ff9db..548951c1ae43e0 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.cc @@ -46,8 +46,8 @@ Value PackScalarIndices(mlir::ValueRange indices, OpBuilder& b) { auto values_count_attr = b.getI32IntegerAttr(num_indices); auto pack_axis_attr = b.getI32IntegerAttr(0); - return b.create(indices.back().getLoc(), packed_indices_type, - indices, values_count_attr, pack_axis_attr); + return TFL::PackOp::create(b, indices.back().getLoc(), packed_indices_type, + indices, values_count_attr, pack_axis_attr); } //===----------------------------------------------------------------------===// @@ -56,8 +56,8 @@ Value PackScalarIndices(mlir::ValueRange indices, OpBuilder& b) { // Cast the value to i32. 
Value BuildTFLCastOp(OpBuilder& b, Value value) { - return b.create( - value.getLoc(), + return TFL::CastOp::create( + b, value.getLoc(), RankedTensorType::get(llvm::cast(value.getType()).getShape(), b.getI32Type()), value); @@ -70,12 +70,12 @@ class LegalizeSliceOp : public OpConversionPattern { LogicalResult matchAndRewrite( mhlo::SliceOp slice_op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { - auto begin = rewriter.create(slice_op.getLoc(), - slice_op.getStartIndices()); - auto end = rewriter.create(slice_op.getLoc(), - slice_op.getLimitIndices()); - auto strides = rewriter.create(slice_op.getLoc(), - slice_op.getStrides()); + auto begin = arith::ConstantOp::create(rewriter, slice_op.getLoc(), + slice_op.getStartIndices()); + auto end = arith::ConstantOp::create(rewriter, slice_op.getLoc(), + slice_op.getLimitIndices()); + auto strides = arith::ConstantOp::create(rewriter, slice_op.getLoc(), + slice_op.getStrides()); auto zero = rewriter.getIntegerAttr(rewriter.getI32Type(), 0); auto no_offset = rewriter.getBoolAttr(false); @@ -116,8 +116,8 @@ LogicalResult CastSliceIndicesToSignless::matchAndRewrite( llvm::SmallVector casted_start_inds; for (auto start_ind_opr : op.getStartIndices()) { - auto casted_start_ind_opr = rewriter.create( - start_ind_opr.getLoc(), start_ind_opr, new_start_e_type); + auto casted_start_ind_opr = mhlo::ConvertOp::create( + rewriter, start_ind_opr.getLoc(), start_ind_opr, new_start_e_type); casted_start_inds.push_back(casted_start_ind_opr.getResult()); } @@ -161,8 +161,8 @@ LogicalResult LegalizeDynamicSliceOp::matchAndRewrite( // clamp start indices between zero and shape(operand) - slice_sizes //=----- - Value clamp_left_cst = rewriter.create( - op->getLoc(), rewriter.getZeroAttr(start_type)); + Value clamp_left_cst = arith::ConstantOp::create( + rewriter, op->getLoc(), rewriter.getZeroAttr(start_type)); llvm::SmallVector new_start_indices; const auto stride_sizes = UnrollI64Splat(op.getSliceSizes()); @@ -170,15 +170,15 @@ LogicalResult LegalizeDynamicSliceOp::matchAndRewrite( for (auto [dim_size, start_ind_opr, stride_size] : llvm::zip(input_type.getShape(), op.getStartIndices(), stride_sizes)) { const int64_t clamp_right_val = dim_size - stride_size; - auto clamp_right_cst = rewriter.create( - op->getLoc(), + auto clamp_right_cst = arith::ConstantOp::create( + rewriter, op->getLoc(), DenseElementsAttr::get(start_type, rewriter.getIntegerAttr( start_e_type, clamp_right_val))); - Value new_start_ind = rewriter.create( - op->getLoc(), start_type, clamp_left_cst, start_ind_opr); - new_start_ind = rewriter.create( - op->getLoc(), start_type, clamp_right_cst, new_start_ind); + Value new_start_ind = TFL::MaximumOp::create( + rewriter, op->getLoc(), start_type, clamp_left_cst, start_ind_opr); + new_start_ind = TFL::MinimumOp::create(rewriter, op->getLoc(), start_type, + clamp_right_cst, new_start_ind); new_start_indices.push_back(new_start_ind); } @@ -190,7 +190,7 @@ LogicalResult LegalizeDynamicSliceOp::matchAndRewrite( auto packed_indices = PackScalarIndices(new_start_indices, rewriter); auto slice_sizes_cst = - rewriter.create(op->getLoc(), op.getSliceSizes()); + arith::ConstantOp::create(rewriter, op->getLoc(), op.getSliceSizes()); rewriter.replaceOpWithNewOp(op, op.getType(), op.getOperand(), packed_indices, slice_sizes_cst); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc index 
a64012415729e4..6dcf03b1600244 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc @@ -70,7 +70,7 @@ Value BuildIntConstOp(ImplicitLocOpBuilder& builder, ConversionPatternRewriter& rewriter, int64_t const_value, Type type) { Value result_const = - builder.create(rewriter.getIntegerAttr(type, const_value)); + TF::ConstOp::create(builder, rewriter.getIntegerAttr(type, const_value)); return result_const; } @@ -115,8 +115,8 @@ LogicalResult NormalizeIndexVector(Operation* parent_op, Value& indices, new_start_indices_shape.push_back(1); indices_type = RankedTensorType::get(new_start_indices_shape, indices_type.getElementType()); - indices = rewriter.create(parent_op->getLoc(), - indices_type, indices); + indices = mhlo::ReshapeOp::create(rewriter, parent_op->getLoc(), + indices_type, indices); } else if (index_vector_dim != indices_type.getRank() - 1) { // If index_vector_dim isn't the last dimension in indices then it isn't // supported yet. @@ -197,8 +197,8 @@ Value InsertTranspose(Value value, int batch_dim, int feature_dim, default_batch_dim, default_feature_dim, default_spatial_dim_start, num_spatial_dims, type, rewriter); - return rewriter.create(value.getLoc(), type, value, - permutation); + return mhlo::TransposeOp::create(rewriter, value.getLoc(), type, value, + permutation); } Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) { @@ -206,10 +206,10 @@ Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) { if (auto shaped_type = mlir::dyn_cast(val.getType())) { ShapedType new_type = RankedTensorType::get(shaped_type.getShape(), new_ele_type); - return rewriter.create(loc, new_type, val); + return TFL::CastOp::create(rewriter, loc, new_type, val); } - return rewriter.create( - loc, UnrankedTensorType::get(new_ele_type), val); + return TFL::CastOp::create(rewriter, loc, + UnrankedTensorType::get(new_ele_type), val); } // Replaces `region`'s terminator to TFL::Yield. diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h index c72fce3ffc6a84..1bf33c1d0d993e 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h @@ -75,7 +75,7 @@ Value BuildIntArrayConstOp(ImplicitLocOpBuilder& builder, } const_value_raw = rewriter.getI32TensorAttr(const_i32_vec); } - Value result_const = builder.create(const_value_raw); + Value result_const = ConstOpT::create(builder, const_value_raw); return result_const; } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.cc index c2323b63b9370c..0de2ccafedbe16 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.cc @@ -51,9 +51,10 @@ class LeagalizeWhileOp : public OpConversionPattern { // currently doesn't support stateless, so this // parameters are set to the default values. 
auto is_stateless = rewriter.getBoolAttr(false); - auto new_while = rewriter.create( - while_op.getLoc(), while_op->getResultTypes(), while_op->getOperands(), - /*is_stateless=*/is_stateless); + auto new_while = TFL::WhileOp::create(rewriter, while_op.getLoc(), + while_op->getResultTypes(), + while_op->getOperands(), + /*is_stateless=*/is_stateless); new_while.getCond().takeBody(while_op.getCond()); new_while.getBody().takeBody(while_op.getBody()); TFLReplaceReturnOp(new_while.getCond(), rewriter); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc index 113293596536c9..c7f88bb2ebeebc 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc @@ -69,9 +69,9 @@ struct ReplaceCustomCallWithComposite final auto decomposition = mlir::cast(calledComputations[0]); - auto composite = rewriter.create( - op.getLoc(), op.getResultTypes(), op.getOperands(), name.str(), attrs, - decomposition.getValue()); + auto composite = mlir::stablehlo::CompositeOp::create( + rewriter, op.getLoc(), op.getResultTypes(), op.getOperands(), + name.str(), attrs, decomposition.getValue()); rewriter.replaceOp(op, composite.getResults()); return success(); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc index 704dbf37d680dd..836598d19a7516 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc @@ -59,7 +59,7 @@ namespace { Value MaterializeIllegalCast(OpBuilder &builder, Type type, ValueRange inputs, Location loc) { - return builder.create(loc, type, inputs) + return UnrealizedConversionCastOp::create(builder, loc, type, inputs) ->getResult(0); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc index 78da8b153f47fc..614bd070748267 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc @@ -177,8 +177,8 @@ class ConvertTFXlaCallModuleOp : public OpRewritePattern { SmallVector call_op_operands(op.getOperands()); if (ContainsPlatformIndexArg(op)) { - Value dummy_const = rewriter.create( - op.getLoc(), + Value dummy_const = TF::ConstOp::create( + rewriter, op.getLoc(), DenseIntElementsAttr::get( RankedTensorType::get({}, rewriter.getIntegerType(32)), {0})); call_op_operands.insert(call_op_operands.begin(), dummy_const); @@ -196,16 +196,16 @@ class ConvertTFXlaCallModuleOp : public OpRewritePattern { Value operand = std::get<0>(operand_and_type); Type expected_type = std::get<1>(operand_and_type); if (operand.getType() != expected_type) { - operand = rewriter.create( - op.getLoc(), expected_type, operand, - /*Truncate=*/rewriter.getBoolAttr(false)); + operand = + TF::CastOp::create(rewriter, op.getLoc(), expected_type, operand, + /*Truncate=*/rewriter.getBoolAttr(false)); } casted_operands.push_back(operand); } - auto call = rewriter.create( 
- op->getLoc(), main_fn.getSymName(), main_fn.getResultTypes(), - casted_operands); + auto call = + func::CallOp::create(rewriter, op->getLoc(), main_fn.getSymName(), + main_fn.getResultTypes(), casted_operands); rewriter.replaceOp(op, call->getResults()); return success(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc index 282c44a958c27f..1effffd9aa00e3 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc @@ -269,9 +269,9 @@ LogicalResult LiftDotConcatLHS(mhlo::ConcatenateOp concat, mlir::dyn_cast(v.getType()).getShape()[new_concat_dim]; } - auto new_concat = rewriter.create( - concat->getLoc(), concat.getType().clone(new_concat_shape), all_dot_lhs, - rewriter.getI64IntegerAttr(new_concat_dim)); + auto new_concat = mhlo::ConcatenateOp::create( + rewriter, concat->getLoc(), concat.getType().clone(new_concat_shape), + all_dot_lhs, rewriter.getI64IntegerAttr(new_concat_dim)); rewriter.replaceOpWithNewOp( concat, concat.getType(), new_concat, first_dot.getRhs(), first_dot.getDotDimensionNumbers(), first_dot.getPrecisionConfigAttr(), @@ -368,11 +368,11 @@ LogicalResult LiftDotConcatLHSAndRHS(mhlo::ConcatenateOp concat, mlir::dyn_cast(v.getType()).getShape()[rhs_batch_dim]; } - auto lhs_new_concat = rewriter.create( - concat->getLoc(), concat.getType().clone(lhs_new_concat_shape), + auto lhs_new_concat = mhlo::ConcatenateOp::create( + rewriter, concat->getLoc(), concat.getType().clone(lhs_new_concat_shape), all_dot_lhs, rewriter.getI64IntegerAttr(lhs_batch_dim)); - auto rhs_new_concat = rewriter.create( - concat->getLoc(), concat.getType().clone(rhs_new_concat_shape), + auto rhs_new_concat = mhlo::ConcatenateOp::create( + rewriter, concat->getLoc(), concat.getType().clone(rhs_new_concat_shape), all_dot_rhs, rewriter.getI64IntegerAttr(rhs_batch_dim)); rewriter.replaceOpWithNewOp( concat, concat.getType(), lhs_new_concat, rhs_new_concat, @@ -439,7 +439,8 @@ LogicalResult FuseSliceConcat(mhlo::ConcatenateOp concat, new_slice_shape.push_back(second_limit - first_start); } - auto new_slice = rewriter.create( + auto new_slice = mhlo::SliceOp::create( + rewriter, FusedLoc::get(first->getContext(), {first.getLoc(), second.getLoc()}), first.getType().clone(new_slice_shape), first.getOperand(), /*start_indices=*/rewriter.getI64TensorAttr(new_start), @@ -730,8 +731,8 @@ class SimplifyBroadcastInDimsReshape auto new_broadcast_input_type = RankedTensorType::get( new_broadcast_input_shape, broadcast_type.getElementType()); - auto new_broadcast_input = rewriter.create( - op->getLoc(), new_broadcast_input_type, op.getOperand()); + auto new_broadcast_input = mhlo::ReshapeOp::create( + rewriter, op->getLoc(), new_broadcast_input_type, op.getOperand()); auto new_broadcast_dims_attr = rewriter.getI64TensorAttr(new_broadcast_dims); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc index 249a1018e091f4..13f981c8714f46 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc @@ -44,8 +44,8 @@ LogicalResult SmuggleOp(Operation* op, PatternRewriter& rewriter) { rewriter.getNamedAttr("call_target_name", op->getName().getIdentifier()); SmallVector attrs{op->getAttrs()}; attrs.push_back(call_target); - auto 
custom_call = rewriter.create( - op->getLoc(), op->getResultTypes(), op->getOperands(), attrs); + auto custom_call = mlir::stablehlo::CustomCallOp::create( + rewriter, op->getLoc(), op->getResultTypes(), op->getOperands(), attrs); rewriter.replaceOp(op, custom_call.getResults()); return success(); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc index fcecd557aeab1c..557b721bfaf35f 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc @@ -143,13 +143,15 @@ class FuseStablehloMulAndConvolutionPattern broadcast_dims = DenseI64ArrayAttr::get(rewriter.getContext(), {filter_rank - 1}); } - Value broadcast_multiplier = rewriter.create( - mul_op.getLoc(), filter.getType(), multiplier, broadcast_dims); - Value new_filter = rewriter.create( - mul_op.getLoc(), filter.getType(), filter, broadcast_multiplier); - Value new_conv = rewriter.create( - mul_op.getLoc(), conv_op.getType(), conv_op.getLhs(), new_filter, - conv_op.getWindowStridesAttr(), conv_op.getPaddingAttr(), + Value broadcast_multiplier = stablehlo::BroadcastInDimOp::create( + rewriter, mul_op.getLoc(), filter.getType(), multiplier, + broadcast_dims); + Value new_filter = + stablehlo::MulOp::create(rewriter, mul_op.getLoc(), filter.getType(), + filter, broadcast_multiplier); + Value new_conv = stablehlo::ConvolutionOp::create( + rewriter, mul_op.getLoc(), conv_op.getType(), conv_op.getLhs(), + new_filter, conv_op.getWindowStridesAttr(), conv_op.getPaddingAttr(), conv_op.getLhsDilationAttr(), conv_op.getRhsDilationAttr(), conv_op.getWindowReversalAttr(), conv_op.getDimensionNumbers(), conv_op.getFeatureGroupCount(), conv_op.getBatchGroupCount(), @@ -169,8 +171,8 @@ class FuseStablehloMulAndConvolutionPattern conv_op) { return failure(); } - Value new_shape_of = rewriter.create( - mul_op.getLoc(), shape_of_op.getType(), new_conv); + Value new_shape_of = shape::ShapeOfOp::create( + rewriter, mul_op.getLoc(), shape_of_op.getType(), new_conv); shape_of_op.replaceAllUsesWith(new_shape_of); rewriter.replaceOp(mul_op, {new_conv}); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc index 0c43a5c4047a64..b283dea3098232 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc @@ -69,7 +69,7 @@ arith::ConstantOp ShapeToConst(PatternRewriter& rewriter, Value value) { auto attr_type = RankedTensorType::get({static_cast(shape.size())}, rewriter.getIntegerType(64)); auto attr = DenseElementsAttr::get(attr_type, shape); - return rewriter.create(value.getLoc(), attr_type, attr); + return arith::ConstantOp::create(rewriter, value.getLoc(), attr_type, attr); } // Returns true if broadcast_dimensions obey Tensorflow convention, as in new @@ -107,7 +107,7 @@ arith::ConstantOp ExpandedShape(OpBuilder& b, Value input, auto attr_type = RankedTensorType::get( {static_cast(expanded_shape.size())}, b.getIntegerType(32)); auto attr = DenseElementsAttr::get(attr_type, expanded_shape); - return b.create(output.getLoc(), attr_type, attr); + return arith::ConstantOp::create(b, output.getLoc(), attr_type, attr); } Value ExpandedDynamicShape(OpBuilder& b, Value input, @@ -132,7 
+132,7 @@ Value ExpandedDynamicShape(OpBuilder& b, Value input, for (int64_t i : expanded_dimensions) { auto index_attr = DenseIntElementsAttr::get( RankedTensorType::get({}, b.getI64Type()), {i}); - Value index = b.create(output.getLoc(), index_attr); + Value index = arith::ConstantOp::create(b, output.getLoc(), index_attr); auto cur_type = llvm::cast(expanded_input.getType()); auto cur_shape = cur_type.getShape(); @@ -145,8 +145,8 @@ Value ExpandedDynamicShape(OpBuilder& b, Value input, auto new_type = RankedTensorType::get(new_shape, cur_type.getElementType()); - expanded_input = b.create(output.getLoc(), new_type, - expanded_input, index); + expanded_input = TFL::ExpandDimsOp::create(b, output.getLoc(), new_type, + expanded_input, index); } return expanded_input; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc index b0a023494f1ca4..b5aded528cdc25 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc @@ -74,18 +74,17 @@ class UnfoldSplatConstantPass return; } op_builder->setInsertionPoint(const_op); - Value scalar = op_builder->create( - const_op->getLoc(), + Value scalar = mhlo::ConstantOp::create( + *op_builder, const_op->getLoc(), DenseElementsAttr::get( RankedTensorType::get(/*shape=*/{}, element_type), splat_elements_attr.getSplatValue())); auto broadcast_dims = DenseIntElementsAttr::get( RankedTensorType::get(/*shape=*/{0}, op_builder->getI64Type()), llvm::SmallVector{}); - mhlo::BroadcastInDimOp broadcast_in_dim_op = - op_builder->create( - const_op->getLoc(), splat_elements_attr.getType(), scalar, - broadcast_dims); + mhlo::BroadcastInDimOp broadcast_in_dim_op = mhlo::BroadcastInDimOp::create( + *op_builder, const_op->getLoc(), splat_elements_attr.getType(), scalar, + broadcast_dims); const_op->replaceAllUsesWith(broadcast_in_dim_op); const_op->erase(); } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_buffer.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_buffer.mlir new file mode 100644 index 00000000000000..987f5a90e374f5 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_buffer.mlir @@ -0,0 +1,14 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s + +module { + func.func public @main(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %0 = "tfl.external_const"() <{external_buffer = #tfl.external_buffer}> : () -> tensor<2x2xf32> + %1 = tfl.add %arg0, %0 {fused_activation_function = "NONE"} : tensor<2x2xf32> + return %1 : tensor<2x2xf32> + } +} + +// CHECK-LABEL: @main +// CHECK: %0 = "tfl.external_const"() <{external_buffer = #tfl.external_buffer}> +// CHECK-NEXT: %1 = tfl.add %arg0, %0 {fused_activation_function = "NONE"} : tensor<2x2xf32> +// CHECK-NEXT: return %1 diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/external_buffer.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/external_buffer.mlir new file mode 100644 index 00000000000000..09d7e764b1f7a2 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/external_buffer.mlir @@ -0,0 +1,34 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +module { + func.func public @main(%arg0: tensor<2x2xf32>) -> 
tensor<2x2xf32> { + %0 = "tfl.external_const"() <{external_buffer = #tfl.external_buffer}> : () -> tensor<2x2xf32> + %1 = tfl.add %arg0, %0 {fused_activation_function = "NONE"} : tensor<2x2xf32> + return %1 : tensor<2x2xf32> + } +} + +// CHECK: tensors: [ { +// CHECK: shape: [ 2, 2 ], +// CHECK: buffer: 1, +// CHECK: name: "arg0", +// CHECK: has_rank: true +// CHECK: }, { +// CHECK: shape: [ 2, 2 ], +// CHECK: name: "tfl.external_const", +// CHECK: has_rank: true, +// CHECK: external_buffer: 2147483648 +// CHECK: }, { +// CHECK: shape: [ 2, 2 ], +// CHECK: buffer: 2, +// CHECK: name: "tfl.add", +// CHECK: has_rank: true +// CHECK: } ], +// CHECK: external_buffer_groups: [ { +// CHECK: name: "test.bin" +// CHECK: } ], +// CHECK: external_buffers: [ { +// CHECK: id: 2147483648, +// CHECK: length: 13, +// CHECK: packing: "unpacked" +// CHECK: } ] diff --git a/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc b/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc index 6b92b5f63ee66f..e04be6148b7b1d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc +++ b/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc @@ -93,7 +93,7 @@ class DequantizeConverter : public OpRewritePattern { if (QuantizedType::getQuantizedElementType(operand.getType())) { auto newTy = QuantizedType::castToExpressedType(operand.getType()); newOperands.push_back( - rewriter.create(loc, newTy, operand)); + TFL::DequantizeOp::create(rewriter, loc, newTy, operand)); continue; } @@ -109,9 +109,8 @@ class DequantizeConverter : public OpRewritePattern { newResultTys.push_back(resultTy); } - auto newResults = rewriter - .create(loc, newResultTys, newOperands, - op->getAttrDictionary().getValue()) + auto newResults = SrcOp::create(rewriter, loc, newResultTys, newOperands, + op->getAttrDictionary().getValue()) .getOperation() ->getResults(); @@ -120,8 +119,8 @@ class DequantizeConverter : public OpRewritePattern { Value result = newResults[i]; Type resultTy = op->getOpResult(i).getType(); if (QuantizedType::getQuantizedElementType(resultTy)) { - replaceResults.push_back(rewriter.create( - loc, resultTy, result, TypeAttr::get(resultTy))); + replaceResults.push_back(TFL::QuantizeOp::create( + rewriter, loc, resultTy, result, TypeAttr::get(resultTy))); continue; } diff --git a/tensorflow/compiler/mlir/lite/transforms/if_outline.cc b/tensorflow/compiler/mlir/lite/transforms/if_outline.cc index 0e7c03dd32b35f..c45d5f74b8988d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/if_outline.cc +++ b/tensorflow/compiler/mlir/lite/transforms/if_outline.cc @@ -84,7 +84,7 @@ func::FuncOp CreateOutlineFuncAndEraseRegion( type = FunctionType::get(context, types, result_types); // Create outlined function and move region body to it. - auto outlined_func = func_builder.create(loc, name, type); + auto outlined_func = func::FuncOp::create(func_builder, loc, name, type); outlined_func.getBody().takeBody(region); Region& func_region = outlined_func.getBody(); @@ -97,8 +97,8 @@ func::FuncOp CreateOutlineFuncAndEraseRegion( // Replace yield op with return. 
Operation* yield_op = outlined_func.getBody().front().getTerminator(); OpBuilder return_builder(yield_op); - return_builder.create(yield_op->getLoc(), - yield_op->getOperands()); + func::ReturnOp::create(return_builder, yield_op->getLoc(), + yield_op->getOperands()); yield_op->erase(); SymbolTable(region.getParentOfType()).insert(outlined_func); @@ -121,8 +121,8 @@ void ReplaceRegionWithCall(StringRef name, Region& region, new_operands.push_back(block->addArgument(t, loc)); } new_operands.append(extern_values.begin(), extern_values.end()); - auto call = b.create(loc, func, new_operands); - b.create(loc, call.getResults()); + auto call = func::CallOp::create(b, loc, func, new_operands); + YieldOp::create(b, loc, call.getResults()); } void IfOutlinePass::OutlineIf(IfOp if_op) { diff --git a/tensorflow/compiler/mlir/lite/transforms/insert_call_once_op.cc b/tensorflow/compiler/mlir/lite/transforms/insert_call_once_op.cc index 4e1fe8e012211a..7a85d60b51d6eb 100644 --- a/tensorflow/compiler/mlir/lite/transforms/insert_call_once_op.cc +++ b/tensorflow/compiler/mlir/lite/transforms/insert_call_once_op.cc @@ -51,7 +51,7 @@ void InsertCallOnceOpFromSessionInitializerPass::runOnOperation() { OpBuilder builder(func.getContext()); builder.setInsertionPointToStart(&func.getBlocks().front()); - builder.create(func.getLoc(), init_func_op.getName()); + TFL::CallOnceOp::create(builder, func.getLoc(), init_func_op.getName()); } } } diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 36091686021e2c..26c5496ff3b08b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -75,14 +75,14 @@ def CreateInt32ConstOrCast : NativeCodeCall< // Creates an int32 constant op from an integer attribute $0. def CreateInt32ConstOpFromIntAttr - : NativeCodeCall<"$_builder.create($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {static_cast(llvm::cast($0).getInt())}))">; + : NativeCodeCall<"TF::ConstOp::create($_builder, $_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {static_cast(llvm::cast($0).getInt())}))">; //===----------------------------------------------------------------------===// // Nullary ops patterns. 
//===----------------------------------------------------------------------===// def createConstOp - : NativeCodeCall<"$_builder.create($_loc, $0.getType(), $1)">; + : NativeCodeCall<"ConstOp::create($_builder, $_loc, $0.getType(), $1)">; def LegalizeTFConstToTFLConst: Pat<(TF_ConstOp:$res ElementsAttr:$value), (createConstOp $res, $value)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td index 9894e7df7587f9..ce9b6af564d2a4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td @@ -20,7 +20,7 @@ include "mlir/IR/OpBase.td" include "mlir/Dialect/Arith/IR/ArithOps.td" def ConstDenseElementsI32ZeroAttr - : NativeCodeCall<"$_builder.create($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {0}))">; + : NativeCodeCall<"TFL::ConstOp::create($_builder, $_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {0}))">; def Size1InputRange : NativeCodeCall< "SmallVector{$0}">; diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc index bfeea6d6e6373a..668493eca931e7 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc @@ -135,8 +135,8 @@ struct ConvertBatchMatMulOp2FullyConnectedOp_Rank2ConstantRhs // mapped to X and Z dimension. std::iter_swap(permute.begin() + input_rank - 1, permute.begin() + input_rank - 2); - auto permutation_tensor_op = rewriter.create( - bmm_op->getLoc(), permuation_tensor_type, + auto permutation_tensor_op = arith::ConstantOp::create( + rewriter, bmm_op->getLoc(), permuation_tensor_type, DenseElementsAttr::get(permuation_tensor_type, permute)); auto input_shape = input_type.getShape(); @@ -181,9 +181,8 @@ struct ConvertBatchMatMulOp2FullyConnectedOp_Rank2ConstantRhs RankedTensorType::get(permuted_shape, input_type.getElementType()); } - return rewriter.create( - bmm_op->getLoc(), output_type, input, - permutation_tensor_op.getResult()); + return TFL::TransposeOp::create(rewriter, bmm_op->getLoc(), output_type, + input, permutation_tensor_op.getResult()); }; Value input_lhs = bmm_op.getX(); @@ -198,10 +197,11 @@ struct ConvertBatchMatMulOp2FullyConnectedOp_Rank2ConstantRhs !bmm_op.getAdjY() ? 
create_z_x_transpose_op(input_rhs) : input_rhs; Type output_type = bmm_op.getResult().getType(); - auto no_input = rewriter.create( - bmm_op->getLoc(), rewriter.getNoneType(), rewriter.getUnitAttr()); - auto fc_op = rewriter.create( - bmm_op->getLoc(), ArrayRef{output_type}, + auto no_input = + TFL::NoValueOp::create(rewriter, bmm_op->getLoc(), + rewriter.getNoneType(), rewriter.getUnitAttr()); + auto fc_op = TFL::FullyConnectedOp::create( + rewriter, bmm_op->getLoc(), ArrayRef{output_type}, /*input=*/output_lhs, /*filter=*/output_rhs, /*bias=*/no_input, /*fused_activation_function=*/rewriter.getStringAttr("NONE"), /*weights_format=*/rewriter.getStringAttr("DEFAULT"), @@ -257,13 +257,14 @@ struct ConvertBatchMatMulOpToReduceSum cY = rhs_shape.size() - 1; } - auto reduce_dim_op = rewriter.create( - bmm_op->getLoc(), + auto reduce_dim_op = TFL::ConstOp::create( + rewriter, bmm_op->getLoc(), DenseIntElementsAttr::get( RankedTensorType::get({1}, rewriter.getI32Type()), {cY})); - auto sum_op = rewriter.create( - bmm_op->getLoc(), bmm_op.getType(), bmm_op.getY(), reduce_dim_op, - /*keep_dims=*/rewriter.getBoolAttr(true)); + auto sum_op = + TFL::SumOp::create(rewriter, bmm_op->getLoc(), bmm_op.getType(), + bmm_op.getY(), reduce_dim_op, + /*keep_dims=*/rewriter.getBoolAttr(true)); rewriter.replaceOp(bmm_op, sum_op); return success(); }; @@ -368,19 +369,21 @@ struct FuseRhsTransposeIntoBatchMatMulOp new_reshape_input_shape.push_back( rhs_contracting_dimensions.SizesArray().front()); - Value new_reshape_shape_value = rewriter.create( - bmm_op->getLoc(), + Value new_reshape_shape_value = arith::ConstantOp::create( + rewriter, bmm_op->getLoc(), GetI32ElementsAttr(new_reshape_input_shape, &rewriter)); - auto new_reshape_value = rewriter.create( - bmm_op->getLoc(), transpose_op.getInput(), new_reshape_shape_value); + auto new_reshape_value = TFL::ReshapeOp::create(rewriter, bmm_op->getLoc(), + transpose_op.getInput(), + new_reshape_shape_value); // Replace the BatchMatMulOp with a FullyConnectedOp, if the RHS of BMM has // no broadcasting dimensions. I.e. RHS of BMM is of Rank 2. if (rhs_dimensions_info.batch_dimensions().AxesArray().empty()) { - auto no_input = rewriter.create( - bmm_op->getLoc(), rewriter.getNoneType(), rewriter.getUnitAttr()); - auto fc_op = rewriter.create( - bmm_op->getLoc(), ArrayRef{bmm_op.getType()}, + auto no_input = TFL::NoValueOp::create(rewriter, bmm_op->getLoc(), + rewriter.getNoneType(), + rewriter.getUnitAttr()); + auto fc_op = TFL::FullyConnectedOp::create( + rewriter, bmm_op->getLoc(), ArrayRef{bmm_op.getType()}, /*input=*/bmm_op.getX(), /*filter=*/new_reshape_value, /*bias=*/no_input, /*fused_activation_function=*/rewriter.getStringAttr("NONE"), @@ -391,9 +394,10 @@ struct FuseRhsTransposeIntoBatchMatMulOp } else { // Replace the BatchMatMulOp with a BatchMatMulOp with adj_y = true and // transpose fused into RHS. 
- auto bmm_op_with_adj_y = rewriter.create( - bmm_op->getLoc(), bmm_op.getType(), bmm_op.getX(), new_reshape_value, - bmm_op.getAdjX(), /*adj_y=*/true, mlir::BoolAttr()); + auto bmm_op_with_adj_y = TFL::BatchMatMulOp::create( + rewriter, bmm_op->getLoc(), bmm_op.getType(), bmm_op.getX(), + new_reshape_value, bmm_op.getAdjX(), /*adj_y=*/true, + mlir::BoolAttr()); rewriter.replaceOp(bmm_op, {bmm_op_with_adj_y.getResult()}); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc index aed2946db17ba3..21b1963998d0d5 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc @@ -131,8 +131,9 @@ Value PrepareBroadcastLikeOpInput(Operation* op, PatternRewriter& rewriter) { RankedTensorType::get({}, elements_attr.getType().getElementType()), elements_attr.getSplatValue()); - return rewriter.create( - op->getLoc(), scalar_elements_attr.getType(), scalar_elements_attr); + return arith::ConstantOp::create(rewriter, op->getLoc(), + scalar_elements_attr.getType(), + scalar_elements_attr); } return nullptr; } @@ -380,10 +381,10 @@ LogicalResult ReorderBroadcastToCast::matchAndRewrite( : static_cast(UnrankedTensorType::get( old_cast_op_output_type.getElementType())); - auto new_cast_op = rewriter.create( - fused_loc, new_cast_op_output_type, input_value); - auto new_broadcast_to_op = rewriter.create( - fused_loc, old_cast_op_output_type, new_cast_op.getOutput(), + auto new_cast_op = TFL::CastOp::create(rewriter, fused_loc, + new_cast_op_output_type, input_value); + auto new_broadcast_to_op = TFL::BroadcastToOp::create( + rewriter, fused_loc, old_cast_op_output_type, new_cast_op.getOutput(), broadcast_to_op.getShape()); rewriter.replaceOp(cast_op, new_broadcast_to_op.getOutput()); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc index ec6e2b5902503f..062d9c1e712de2 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc @@ -502,8 +502,8 @@ Value GetBiasMultiplier(OpBuilder& builder, Value binary_op, (llvm::isa(binary_op.getDefiningOp()) ? 
1.0 : -1.0); Attribute constant_attr = FloatAttr::get(element_type, multiplier); - return builder.create( - binary_op.getLoc(), + return arith::ConstantOp::create( + builder, binary_op.getLoc(), DenseFPElementsAttr::get(RankedTensorType::get({}, element_type), constant_attr)); } @@ -677,10 +677,10 @@ Value ReshapeValueDroppingLastDim(OpBuilder& builder, Value value) { } else { new_shape.push_back(-1); } - return builder.create( - value.getLoc(), value, - builder.create( - value.getLoc(), + return ReshapeOp::create( + builder, value.getLoc(), value, + arith::ConstantOp::create( + builder, value.getLoc(), DenseIntElementsAttr::get( RankedTensorType::get(type.getRank() - 1, builder.getI32Type()), new_shape))); @@ -754,9 +754,8 @@ Value Get1DShapeValue(OpBuilder& builder, Value value) { } auto output_type = RankedTensorType::get({1}, builder.getI32Type()); const int num_elements = type.getNumElements(); - return builder.create( - value.getLoc(), output_type, - DenseIntElementsAttr::get(output_type, num_elements)); + return ConstOp::create(builder, value.getLoc(), output_type, + DenseIntElementsAttr::get(output_type, num_elements)); } Type GetEmbeddingLookupShape(Value lookup, Value value) { @@ -780,8 +779,8 @@ mlir::Value GetFcOutput(OpBuilder* builder, StringAttr fused_activation_function, StringAttr weights_format, BoolAttr keep_num_dims, BoolAttr asymmetric_quantize_inputs) { - auto fc_op = builder->create( - result[0].getLoc(), result.getTypes(), input, filter, bias, + auto fc_op = FullyConnectedOp::create( + *builder, result[0].getLoc(), result.getTypes(), input, filter, bias, fused_activation_function, weights_format, keep_num_dims, asymmetric_quantize_inputs); return fc_op->getResult(0); @@ -973,13 +972,13 @@ struct SqueezeReshapesAroundBroadcastOp .drop_back(num_trailing_broadcast_dims) .drop_front(num_leading_broadcast_dims)}; - Value new_reshape_shape_value = rewriter.create( - inner_reshape_op->getLoc(), + Value new_reshape_shape_value = arith::ConstantOp::create( + rewriter, inner_reshape_op->getLoc(), GetI32ElementsAttr(new_reshape_shape_i32, &rewriter)); - auto new_inner_reshape_op = rewriter.create( - inner_reshape_op->getLoc(), inner_reshape_input, - new_reshape_shape_value); + auto new_inner_reshape_op = + TFL::ReshapeOp::create(rewriter, inner_reshape_op->getLoc(), + inner_reshape_input, new_reshape_shape_value); // Create a new reshape_op to replace the old inner reshape_op. rewriter.replaceOp(inner_reshape_op, new_inner_reshape_op.getResult()); @@ -990,11 +989,12 @@ struct SqueezeReshapesAroundBroadcastOp .drop_back(num_trailing_broadcast_dims) .drop_front(num_leading_broadcast_dims)}; - Value new_broadcast_shape_value = rewriter.create( - loc, GetI64ElementsAttr(new_broadcast_shape, &rewriter)); + Value new_broadcast_shape_value = arith::ConstantOp::create( + rewriter, loc, GetI64ElementsAttr(new_broadcast_shape, &rewriter)); - auto new_broadcast_to_op = rewriter.create( - loc, RankedTensorType::get(new_broadcast_shape, rewriter.getF32Type()), + auto new_broadcast_to_op = TFL::BroadcastToOp::create( + rewriter, loc, + RankedTensorType::get(new_broadcast_shape, rewriter.getF32Type()), new_inner_reshape_op.getOutput(), new_broadcast_shape_value); // Create a new broadcast_op to replace the old broadcast_op. 
@@ -1055,18 +1055,19 @@ struct FuseAddAndStridedSlice : public OpRewritePattern { added_value.reshape(RankedTensorType::get( {num_dims}, mlir::cast(added_value.getType()).getElementType())); - ::mlir::arith::ConstantOp new_end = rewriter.create( - strided_slice_op.getEnd().getLoc(), new_added_value); + ::mlir::arith::ConstantOp new_end = arith::ConstantOp::create( + rewriter, strided_slice_op.getEnd().getLoc(), new_added_value); if (strided_slice_op.getBeginMask() != 0) return failure(); if (strided_slice_op.getEndMask() != 0) return failure(); if (strided_slice_op.getEllipsisMask() != 0) return failure(); mlir::TFL::StridedSliceOp new_strided_slice_op = - rewriter.create( - strided_slice_op.getLoc(), strided_slice_op.getOutput().getType(), - strided_slice_op.getInput(), strided_slice_op.getBegin(), new_end, - strided_slice_op.getStrides(), strided_slice_op.getBeginMask(), - strided_slice_op.getEndMask(), strided_slice_op.getEllipsisMask(), + TFL::StridedSliceOp::create( + rewriter, strided_slice_op.getLoc(), + strided_slice_op.getOutput().getType(), strided_slice_op.getInput(), + strided_slice_op.getBegin(), new_end, strided_slice_op.getStrides(), + strided_slice_op.getBeginMask(), strided_slice_op.getEndMask(), + strided_slice_op.getEllipsisMask(), strided_slice_op.getNewAxisMask(), strided_slice_op.getShrinkAxisMask(), /*offset=*/true); @@ -1186,24 +1187,26 @@ struct Convert2DUpscalingToResizeNearestNeighor SmallVector reshape_shape_in_int64( {1, image_size, image_size, feature_size}); - auto reshape_shape_const_op = rewriter.create( - gather_nd_first->getLoc(), - GetI32ElementsAttr(reshape_shape, &rewriter)); + auto reshape_shape_const_op = + TFL::ConstOp::create(rewriter, gather_nd_first->getLoc(), + GetI32ElementsAttr(reshape_shape, &rewriter)); - auto reshape_op = rewriter.create( - gather_nd_first->getLoc(), + auto reshape_op = TFL::ReshapeOp::create( + rewriter, gather_nd_first->getLoc(), tensorflow::GetTypeFromTFTensorShape(reshape_shape_in_int64, result_type.getElementType()), params_value, reshape_shape_const_op.getResult()); // Add TFL::resize_nearest_neighor op for 2x upscaling. 
SmallVector size_vec = {image_size * 2, image_size * 2}; - auto size_const_op = rewriter.create( - gather_nd_first->getLoc(), GetI32ElementsAttr(size_vec, &rewriter)); + auto size_const_op = + TFL::ConstOp::create(rewriter, gather_nd_first->getLoc(), + GetI32ElementsAttr(size_vec, &rewriter)); - auto resize = rewriter.create( - gather_nd_first->getLoc(), transpose_second.getResult().getType(), - reshape_op.getResult(), size_const_op.getResult(), false, false); + auto resize = TFL::ResizeNearestNeighborOp::create( + rewriter, gather_nd_first->getLoc(), + transpose_second.getResult().getType(), reshape_op.getResult(), + size_const_op.getResult(), false, false); rewriter.replaceOp(transpose_second, resize.getResult()); return success(); @@ -1233,13 +1236,13 @@ static std::optional GetAs1DValue(PatternRewriter& rewriter, Value value, RankedTensorType::get({num_channels}, type.getElementType()); auto splat_attr = DenseElementsAttr::get(splat_type, attr.getSplatValue()); - return rewriter.create(value.getLoc(), splat_attr); + return arith::ConstantOp::create(rewriter, value.getLoc(), splat_attr); } if (HasOneTailUnitDimension(attr) && attr.getNumElements() == num_channels) { auto flattened = FlattenTo1D(attr); - return rewriter.create(value.getLoc(), flattened); + return arith::ConstantOp::create(rewriter, value.getLoc(), flattened); } } @@ -1259,7 +1262,7 @@ static std::optional GetBiasIn1D(PatternRewriter& rewriter, Value bias, RankedTensorType type = RankedTensorType::get({num_channels}, fallback_element_type); auto attr = rewriter.getZeroAttr(type); - return rewriter.create(bias.getLoc(), type, attr); + return arith::ConstantOp::create(rewriter, bias.getLoc(), type, attr); } auto bias_type = mlir::dyn_cast(bias.getType()); @@ -1377,34 +1380,34 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { } auto new_bias = - rewriter - .create(add_op.getLoc(), bias_1d.value(), add_rhs_1d.value(), - rewriter.getStringAttr("NONE")) + AddOp::create(rewriter, add_op.getLoc(), bias_1d.value(), + add_rhs_1d.value(), rewriter.getStringAttr("NONE")) .getOutput(); mlir::Value out = - rewriter - .create( - mlir::FusedLoc::get(fc_op.getContext(), - {fc_op.getLoc(), add_op.getLoc()}), - fc_output_type, - /*input=*/fc_op.getInput(), - /*filter=*/filter, - /*bias=*/new_bias, - /*fused_activation_function=*/ - rewriter.getStringAttr(add_op.getFusedActivationFunction()), - /*weights_format=*/ - rewriter.getStringAttr(fc_op.getWeightsFormat()), - /*keep_num_dims=*/rewriter.getBoolAttr(fc_op.getKeepNumDims()), - /*asymmetric_quantize_inputs=*/ - fc_op.getAsymmetricQuantizeInputsAttr()) + TFL::FullyConnectedOp::create( + rewriter, + mlir::FusedLoc::get(fc_op.getContext(), + {fc_op.getLoc(), add_op.getLoc()}), + fc_output_type, + /*input=*/fc_op.getInput(), + /*filter=*/filter, + /*bias=*/new_bias, + /*fused_activation_function=*/ + rewriter.getStringAttr(add_op.getFusedActivationFunction()), + /*weights_format=*/ + rewriter.getStringAttr(fc_op.getWeightsFormat()), + /*keep_num_dims=*/rewriter.getBoolAttr(fc_op.getKeepNumDims()), + /*asymmetric_quantize_inputs=*/ + fc_op.getAsymmetricQuantizeInputsAttr()) .getOutput()[0]; if (fc_output_type.getShape() != add_output_type.getShape()) { - auto target_shape = rewriter.create( - add_op.getLoc(), rewriter.getI32TensorAttr(llvm::SmallVector( - add_output_type.getShape()))); - out = rewriter.create(add_op.getLoc(), add_output_type, out, - target_shape); + auto target_shape = arith::ConstantOp::create( + rewriter, add_op.getLoc(), + rewriter.getI32TensorAttr( + 
llvm::SmallVector(add_output_type.getShape()))); + out = ReshapeOp::create(rewriter, add_op.getLoc(), add_output_type, out, + target_shape); } rewriter.replaceOp(add_op, out); @@ -1471,8 +1474,8 @@ struct FuseAddAndFullyConnected return failure(); } - auto new_bias = rewriter.create( - fc_op.getLoc(), old_bias.getType(), + auto new_bias = TFL::FullyConnectedOp::create( + rewriter, fc_op.getLoc(), old_bias.getType(), /*input=*/add_op.getRhs(), /*filter=*/fc_op.getFilter(), /*bias=*/old_bias, @@ -1482,7 +1485,8 @@ struct FuseAddAndFullyConnected /*asymmetric_quantize_inputs=*/fc_op.getAsymmetricQuantizeInputsAttr()); // Create the updated FC. - auto new_fc = rewriter.create( + auto new_fc = TFL::FullyConnectedOp::create( + rewriter, FusedLoc::get(add_op.getContext(), {add_op.getLoc(), fc_op.getLoc()}), fc_op.getOutput().getTypes(), /*input=*/add_op.getLhs(), @@ -1557,14 +1561,14 @@ struct FuseMulAndFullyConnected auto location = FusedLoc::get(mul_op.getContext(), {mul_op.getLoc(), fc_op.getLoc()}); - auto new_filter = rewriter.create( - location, + auto new_filter = TFL::MulOp::create( + rewriter, location, /*lhs=*/fc_op.getFilter(), /*rhs=*/mul_op.getRhs(), /*fused_activation_function=*/rewriter.getStringAttr("NONE")); // Create the updated FC. - auto new_fc = rewriter.create( - location, fc_op.getOutput().getTypes(), + auto new_fc = TFL::FullyConnectedOp::create( + rewriter, location, fc_op.getOutput().getTypes(), /*input=*/mul_op.getLhs(), /*filter=*/new_filter, /*bias=*/fc_op.getBias(), @@ -1597,7 +1601,8 @@ struct FuseFullyConnectedAndReluX : public OpRewritePattern { rewriter.getStringAttr(fully_connected_op.getWeightsFormat()); auto new_keep_num_dims = rewriter.getBoolAttr(fully_connected_op.getKeepNumDims()); - auto fc = rewriter.create( + auto fc = FullyConnectedOp::create( + rewriter, FusedLoc::get(relu_op.getContext(), {fully_connected_op.getLoc(), relu_op.getLoc()}), relu_op.getType(), /*input=*/fully_connected_op.getInput(), @@ -1674,7 +1679,7 @@ struct FuseFullyConnectedAndMul : public OpRewritePattern { } auto new_op = - rewriter.create(mul_op.getLoc(), new_type, new_cst); + arith::ConstantOp::create(rewriter, mul_op.getLoc(), new_type, new_cst); Value new_const_val = new_op.getResult(); // Rewrite. Since the folder of TFL::MulOp couldn't broadcast the operands, @@ -1689,15 +1694,16 @@ struct FuseFullyConnectedAndMul : public OpRewritePattern { if (size > (1 << 30)) return failure(); } auto new_filter = - rewriter.create(mul_op.getLoc(), filter, new_const_val) + TF::MulOp::create(rewriter, mul_op.getLoc(), filter, new_const_val) .getZ(); // If bias isn't None, it needs to be multiplied as well. if (!mlir::isa(bias.getType())) { - bias = rewriter.create(mul_op.getLoc(), bias, constant_val) + bias = TF::MulOp::create(rewriter, mul_op.getLoc(), bias, constant_val) .getZ(); } - auto fc = rewriter.create( + auto fc = TFL::FullyConnectedOp::create( + rewriter, FusedLoc::get(fc_op.getContext(), {fc_op.getLoc(), mul_op.getLoc()}), mul_op.getType(), /*input=*/fc_op.getInput(), @@ -1848,13 +1854,13 @@ struct FuseAffinOpAndMulWithQDQs : public OpRewritePattern { DenseElementsAttr broadcasted_gamma_attr = ExpandTo4DForConv(gamma_cst, filter_output_dim); auto broadcasted_gamma = - rewriter.create(loc, broadcasted_gamma_attr); + ConstOp::create(rewriter, loc, broadcasted_gamma_attr); // Inject a mul between the filter constant and the quantize op. 
- auto new_filter = rewriter - .create(loc, filter, broadcasted_gamma, - rewriter.getStringAttr("NONE")) - .getResult(); + auto new_filter = + TFL::MulOp::create(rewriter, loc, filter, broadcasted_gamma, + rewriter.getStringAttr("NONE")) + .getResult(); // Update the scale in the quantize op. auto new_qtype = RescaleQtype(q_op.getQtype(), gamma_cst); if (!new_qtype) { @@ -1869,11 +1875,11 @@ struct FuseAffinOpAndMulWithQDQs : public OpRewritePattern { auto squeezed_gamma = FlattenTo1D(gamma_cst); auto squeezed_gamma_type = squeezed_gamma.getType(); - auto squeezed_gamma_op = rewriter.create( - affine_op.getLoc(), squeezed_gamma_type, squeezed_gamma); + auto squeezed_gamma_op = arith::ConstantOp::create( + rewriter, affine_op.getLoc(), squeezed_gamma_type, squeezed_gamma); - auto new_bias = rewriter.create( - loc, bias, squeezed_gamma_op, rewriter.getStringAttr("NONE")); + auto new_bias = TFL::MulOp::create(rewriter, loc, bias, squeezed_gamma_op, + rewriter.getStringAttr("NONE")); affine_op.getOperation()->replaceUsesOfWith(bias, new_bias); } @@ -1977,7 +1983,7 @@ struct FuseBinaryOpToFollowingAffineOp : public OpRewritePattern { } auto new_bias = DenseFPElementsAttr::get(new_bias_type, new_bias_values); auto new_bias_op = - rewriter.create(fc_op.getLoc(), new_bias_type, new_bias); + ConstOp::create(rewriter, fc_op.getLoc(), new_bias_type, new_bias); fc_op.setOperand(0, binary_op->getOperand(0)); fc_op.setOperand(2, new_bias_op); } else if (llvm::isa(binary_op)) { @@ -1992,8 +1998,8 @@ struct FuseBinaryOpToFollowingAffineOp : public OpRewritePattern { }); // We recreate the constant op in case it is shared by the other ops. This // might increase the model size. - auto new_filter_op = rewriter.create( - fc_op.getLoc(), filter.getType(), new_filter); + auto new_filter_op = ConstOp::create(rewriter, fc_op.getLoc(), + filter.getType(), new_filter); fc_op.setOperand(0, binary_op->getOperand(0)); if (fc_op.getFilter() != filter) { // This filter goes through quantize and dequantize ops. Then we just @@ -2186,8 +2192,9 @@ struct FuseUnpackAndConcatToReshape new_shape_array_i32.push_back( ShapedType::isDynamic(size) ? -1 : static_cast(size)); } - auto new_shape = rewriter.create( - concat_op.getLoc(), GetI32ElementsAttr(new_shape_array_i32, &rewriter)); + auto new_shape = TFL::ConstOp::create( + rewriter, concat_op.getLoc(), + GetI32ElementsAttr(new_shape_array_i32, &rewriter)); rewriter.replaceOpWithNewOp( concat_op, output_type, unpack_op.getInput(), new_shape); @@ -2273,8 +2280,8 @@ struct OptimizeTopK : public OpRewritePattern { auto k = !values.use_empty() ? k_values : k_indices; // Build scalar tensor k. auto k_ty = mlir::RankedTensorType::get({}, rewriter.getIntegerType(32)); - Value k_cst = rewriter.create( - op.getLoc(), DenseElementsAttr::get(k_ty, k)); + Value k_cst = TFL::ConstOp::create(rewriter, op.getLoc(), + DenseElementsAttr::get(k_ty, k)); // Compute new result types. auto values_ty = mlir::dyn_cast(values.getType()); auto indices_ty = mlir::dyn_cast(indices.getType()); @@ -2287,8 +2294,9 @@ struct OptimizeTopK : public OpRewritePattern { mlir::RankedTensorType::get(shape, values_ty.getElementType()); auto new_indices_ty = mlir::RankedTensorType::get(shape, indices_ty.getElementType()); - TFL::TopKV2Op top_k_op = rewriter.create( - op.getLoc(), new_values_ty, new_indices_ty, op->getOperand(0), k_cst); + TFL::TopKV2Op top_k_op = + TFL::TopKV2Op::create(rewriter, op.getLoc(), new_values_ty, + new_indices_ty, op->getOperand(0), k_cst); // Remove original ops (topk, Slice, Slice). 
if (!values.use_empty()) { @@ -2376,10 +2384,12 @@ struct FuseReshapeAndTransposeAroundBatchMatmul static_cast(std::accumulate( transpose_input.getType().getShape().begin() + 2, transpose_input.getType().getShape().end(), 1, std::multiplies()))}; - auto shape_constant = rewriter.create( - batch_matmul.getLoc(), GetI32ElementsAttr(new_shape, &rewriter)); - auto reshaped_input = rewriter.create( - batch_matmul.getLoc(), transpose_op.getInput(), shape_constant); + auto shape_constant = + ConstOp::create(rewriter, batch_matmul.getLoc(), + GetI32ElementsAttr(new_shape, &rewriter)); + auto reshaped_input = + ReshapeOp::create(rewriter, batch_matmul.getLoc(), + transpose_op.getInput(), shape_constant); rewriter.replaceOpWithNewOp( op, op.getType(), reshaped_input, batch_matmul.getX(), /*adj_x=*/false, /*adj_y=*/!batch_matmul.getAdjX(), @@ -2438,10 +2448,10 @@ struct FuseTransposeReshapeIntoBatchMatmul reshape_op.getType().getShape().drop_front().begin(), reshape_op.getType().getShape().drop_front().end()); new_shape.push_back(reshape_op.getType().getDimSize(0)); - auto shape_constant = rewriter.create( - op.getLoc(), GetI32ElementsAttr(new_shape, &rewriter)); - auto new_reshape = rewriter.create( - op.getLoc(), transpose_op.getInput(), shape_constant); + auto shape_constant = ConstOp::create( + rewriter, op.getLoc(), GetI32ElementsAttr(new_shape, &rewriter)); + auto new_reshape = ReshapeOp::create( + rewriter, op.getLoc(), transpose_op.getInput(), shape_constant); rewriter.replaceOpWithNewOp( op, op.getType(), op.getX(), new_reshape, op.getAdjX(), !op.getAdjY(), op.getAsymmetricQuantizeInputsAttr()); @@ -2647,8 +2657,8 @@ struct UndoBroadcastFullyConnectedBiasAddWithQDQs auto new_bias = FlattenTo1D(bias_op.getValueAttr()); auto new_bias_type = new_bias.getType(); - auto new_bias_op = rewriter.create( - bias_op.getLoc(), new_bias_type, new_bias); + auto new_bias_op = arith::ConstantOp::create(rewriter, bias_op.getLoc(), + new_bias_type, new_bias); // Update QuantizeOp with the new bias and its output shape q_op.setOperand(new_bias_op); @@ -2717,10 +2727,11 @@ struct MoveReshapeAfterFullyConnected new_input_shape.pop_back(); new_input_shape.push_back(input_ty.getShape().back()); - auto reshape_before = rewriter.create( - fc.getLoc(), fc.getInput(), - rewriter.create( - fc->getLoc(), GetI32ElementsAttr(new_input_shape, &rewriter))); + auto reshape_before = TFL::ReshapeOp::create( + rewriter, fc.getLoc(), fc.getInput(), + arith::ConstantOp::create( + rewriter, fc->getLoc(), + GetI32ElementsAttr(new_input_shape, &rewriter))); rewriter.replaceOpWithNewOp( reshape, @@ -2864,16 +2875,16 @@ struct PushTransposeThroughSqueeze : public RewritePattern { transpose.getInput().getType().getDimSize(i)); } } - auto new_squeeze = rewriter.create( - squeeze->getLoc(), + auto new_squeeze = TFL::SqueezeOp::create( + rewriter, squeeze->getLoc(), mlir::RankedTensorType::get(new_squeeze_shape, squeeze.getType().getElementType()), transpose.getInput(), rewriter.getI32ArrayAttr(new_squeeze_dims)); - auto new_transpose = rewriter.create( - squeeze->getLoc(), squeeze.getType(), new_squeeze, - rewriter.create( - squeeze->getLoc(), GetI32ElementsAttr(new_perm, &rewriter))); + auto new_transpose = TFL::TransposeOp::create( + rewriter, squeeze->getLoc(), squeeze.getType(), new_squeeze, + arith::ConstantOp::create(rewriter, squeeze->getLoc(), + GetI32ElementsAttr(new_perm, &rewriter))); rewriter.replaceOp(squeeze, new_transpose); return success(); @@ -3000,17 +3011,18 @@ struct ReorderTransposeReshapeTranspose 
mlir::dyn_cast_or_null(reshape.getType()); if (!reshape_type) return failure(); - auto new_reshape_shape_const = rewriter.create( - reshape.getLoc(), GetI32ElementsAttr(new_reshape_shape, &rewriter)); + auto new_reshape_shape_const = arith::ConstantOp::create( + rewriter, reshape.getLoc(), + GetI32ElementsAttr(new_reshape_shape, &rewriter)); - auto new_inner_reshape = rewriter.create( - reshape.getLoc(), + auto new_inner_reshape = TFL::ReshapeOp::create( + rewriter, reshape.getLoc(), RankedTensorType::get(new_reshape_shape, reshape_type.getElementType()), input, new_reshape_shape_const.getResult()); - auto new_inner_tpose = rewriter.create( - inner_tpose.getLoc(), reshape_type, new_inner_reshape, - rewriter.create( - inner_tpose.getLoc(), + auto new_inner_tpose = TFL::TransposeOp::create( + rewriter, inner_tpose.getLoc(), reshape_type, new_inner_reshape, + arith::ConstantOp::create( + rewriter, inner_tpose.getLoc(), GetI32ElementsAttr(new_inner_perm, &rewriter))); rewriter.replaceOp(reshape, new_inner_tpose); @@ -3079,8 +3091,8 @@ struct FullyConnectedSwapOperandsWhenLHSIsConst RankedTensorType intermediate_type = RankedTensorType::get({O, B}, element_type); - auto new_fc = rewriter.create( - loc, + auto new_fc = TFL::FullyConnectedOp::create( + rewriter, loc, /*resultTypes=*/intermediate_type, /*input=*/filter, // Original Filter V[O, I] /*filter=*/input, // Original Input C[B, I] @@ -3096,10 +3108,11 @@ struct FullyConnectedSwapOperandsWhenLHSIsConst RankedTensorType final_shape_type = RankedTensorType::get({B, O}, element_type); - Value transposed_result = rewriter.create( - loc, final_shape_type, new_fc.getResult(0), - rewriter.create( - loc, GetI32ElementsAttr(ArrayRef({1, 0}), &rewriter))); + Value transposed_result = TFL::TransposeOp::create( + rewriter, loc, final_shape_type, new_fc.getResult(0), + arith::ConstantOp::create( + rewriter, loc, + GetI32ElementsAttr(ArrayRef({1, 0}), &rewriter))); rewriter.replaceOp(fc, transposed_result); diff --git a/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc b/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc index 7baa0136f1c33c..b93422d3812f6c 100644 --- a/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc +++ b/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc @@ -140,15 +140,15 @@ void PinOpsWithSideEffectsPass::runOnOperation() { // Wrap all side-effect producing/dependent operations in a ControlNodeOp. builder.setInsertionPoint(op); Location loc = op->getLoc(); - auto outer_op = builder.create( - loc, op->getResultTypes(), ControlType::get(op->getContext()), - control_tokens); + auto outer_op = ControlNodeOp::create(builder, loc, op->getResultTypes(), + ControlType::get(op->getContext()), + control_tokens); Region region; Block *new_block = new Block; region.push_back(new_block); builder.setInsertionPointToEnd(®ion.front()); Operation *inner_op = builder.clone(*op); - builder.create(loc, inner_op->getResults()); + YieldOp::create(builder, loc, inner_op->getResults()); outer_op.getBody().takeBody(region); // Careful: We can't use outer_op.getResults(), because that also includes // the control token. 
diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc index 2538cc423cdf1e..0cf34df94faf6c 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc @@ -183,7 +183,7 @@ std::optional GetConstTensor(PatternRewriter& rewriter, auto const_attr = DenseElementsAttr::get(const_type, vec); auto const_op = - rewriter.create(loc, const_type, const_attr); + arith::ConstantOp::create(rewriter, loc, const_type, const_attr); return const_op.getResult(); } @@ -207,8 +207,8 @@ std::optional ConvertDequantizeOp( auto const_attr = DenseElementsAttr::get(const_type, static_cast(zeropoint[0])); - auto const_op = rewriter.create(op->getLoc(), const_type, - const_attr); + auto const_op = arith::ConstantOp::create(rewriter, op->getLoc(), + const_type, const_attr); zp_val = const_op.getResult(); } else { SmallVector shape; @@ -224,8 +224,8 @@ std::optional ConvertDequantizeOp( auto const_attr = DenseElementsAttr::get(const_type, static_cast(scale[0])); - auto const_op = rewriter.create(op->getLoc(), const_type, - const_attr); + auto const_op = arith::ConstantOp::create(rewriter, op->getLoc(), + const_type, const_attr); scale_val = const_op.getResult(); } else { SmallVector shape; @@ -237,16 +237,17 @@ std::optional ConvertDequantizeOp( if (!zp_val || !scale_val) return std::nullopt; auto op1_cast_in = - rewriter.create(op->getLoc(), output_type, input_value); + TFL::CastOp::create(rewriter, op->getLoc(), output_type, input_value); - auto op2_sub_op1 = rewriter.create( - op->getLoc(), output_type, op1_cast_in.getResult(), zp_val.value(), + auto op2_sub_op1 = TFL::SubOp::create( + rewriter, op->getLoc(), output_type, op1_cast_in.getResult(), + zp_val.value(), /*fused_activation_function=*/rewriter.getStringAttr("NONE")); - return rewriter - .create( - op->getLoc(), output_type, op2_sub_op1.getResult(), scale_val.value(), - /*fused_activation_function=*/rewriter.getStringAttr("NONE")) + return TFL::MulOp::create( + rewriter, op->getLoc(), output_type, op2_sub_op1.getResult(), + scale_val.value(), + /*fused_activation_function=*/rewriter.getStringAttr("NONE")) .getResult(); } @@ -313,8 +314,8 @@ struct RemoveVolatileOps : public OpRewritePattern { auto const_type = tensorflow::GetTypeFromTFTensorShape( output_type.getShape(), qtype.getStorageType()); - auto const_op = rewriter.create( - op->getLoc(), const_type, qconst_op.getValue()); + auto const_op = arith::ConstantOp::create( + rewriter, op->getLoc(), const_type, qconst_op.getValue()); auto new_value = ConvertDequantizeOp(rewriter, op, output_type, const_op.getResult(), diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index 235ec7d38615fc..d14ee12b7e55a3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -24,7 +24,7 @@ def DenseElementsAttr : ElementsAttrBase< "non-opaque constant tensor">; def CreateGatherNdOp : NativeCodeCall< - "$_builder.create($0.getLoc(), $0.getType(), $1, $2, $3)">; + "TF::GatherNdOp::create($_builder, $0.getLoc(), $0.getType(), $1, $2, $3)">; def CreateTFCastOpI32 : NativeCodeCall< "CreateTFCastOpI32(&$_builder, $_loc, $0, $1)">; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 957d243e72774d..899e4e9e088312 100644 --- 
a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -102,7 +102,7 @@ static Value CreateTFCastOpI32(OpBuilder *builder, Location loc, Value x, auto x_type = mlir::dyn_cast_or_null(x.getType()); if (!x_type) llvm_unreachable("unsupported type"); Type type = x_type.clone(builder->getI32Type()); - return builder->create(loc, type, x, truncate); + return TF::CastOp::create(*builder, loc, type, x, truncate); } } // namespace @@ -253,7 +253,7 @@ class ConvertTFConvOp : public RewritePattern { tensorflow::GetTypeFromTFTensorShape({bias_dim}, elem_type); auto bias_attr = rewriter.getZeroAttr(bias_type); auto bias = - rewriter.create(op->getLoc(), bias_type, bias_attr); + TF::ConstOp::create(rewriter, op->getLoc(), bias_type, bias_attr); if (op->getAttrOfType("padding").getValue() == "EXPLICIT") { // Add Const op for padding value. @@ -276,12 +276,12 @@ class ConvertTFConvOp : public RewritePattern { mlir::DenseIntElementsAttr::get(padding_attr_type, padding_values); auto padding_const = - rewriter.create(op->getLoc(), padding_attr); + TF::ConstOp::create(rewriter, op->getLoc(), padding_attr); // Add Pad op. auto pad_output_type = UnrankedTensorType::get(elem_type); - input = rewriter.create(op->getLoc(), pad_output_type, input, - padding_const); + input = TF::PadOp::create(rewriter, op->getLoc(), pad_output_type, input, + padding_const); // Set Conv padding to `VALID` since padding has been handled by Pad op. state.padding = rewriter.getStringAttr("VALID"); @@ -315,8 +315,8 @@ class ConvertTFConv2D : public ConvertTFConvOp { Type result_type, Value input, Value filter, Value bias) const { filter = legalizeFilter(rewriter, loc, filter); - return rewriter.create( - loc, result_type, input, filter, bias, + return TFL::Conv2DOp::create( + rewriter, loc, result_type, input, filter, bias, /*dilation_h_factor=*/state->dilation_height_factor, /*dilation_w_factor=*/state->dilation_width_factor, /*fused_activation_function=*/rewriter.getStringAttr("NONE"), @@ -338,7 +338,7 @@ class ConvertTFConv2D : public ConvertTFConvOp { {static_cast(perm.size())}, rewriter.getIntegerType(32)); auto perm_attr = DenseElementsAttr::get(perm_type, llvm::ArrayRef(perm)); - auto perm_op = rewriter.create(loc, perm_type, perm_attr); + auto perm_op = TF::ConstOp::create(rewriter, loc, perm_type, perm_attr); // Create tensor type for the transpose result. 
auto filter_type = mlir::cast(filter.getType()); @@ -350,7 +350,7 @@ class ConvertTFConv2D : public ConvertTFConvOp { auto result_type = tensorflow::GetTypeFromTFTensorShape(result_shape, elem_type); - return rewriter.create(loc, result_type, filter, perm_op); + return TF::TransposeOp::create(rewriter, loc, result_type, filter, perm_op); } }; @@ -382,8 +382,8 @@ class ConvertTFDepthwiseConv2dNative mlir::cast(filter.getType()).getDimSize(3); filter = legalizeFilter(rewriter, loc, filter); - return rewriter.create( - loc, result_type, input, filter, bias, + return TFL::DepthwiseConv2DOp::create( + rewriter, loc, result_type, input, filter, bias, /*dilation_h_factor=*/state->dilation_height_factor, /*dilation_w_factor=*/state->dilation_width_factor, /*fused_activation_function=*/rewriter.getStringAttr("NONE"), @@ -420,9 +420,9 @@ class ConvertTFDepthwiseConv2dNative rewriter.getI32IntegerAttr(ConvertToTfliteSize(size)); } auto shape_attr = DenseElementsAttr::get(shape_type, result_shape_data); - auto shape = rewriter.create(loc, shape_type, shape_attr); + auto shape = TF::ConstOp::create(rewriter, loc, shape_type, shape_attr); - return rewriter.create(loc, result_type, filter, shape); + return TF::ReshapeOp::create(rewriter, loc, result_type, filter, shape); } }; @@ -495,11 +495,11 @@ struct ConvertTFStridedSlice : public RewritePattern { auto shape_attr = DenseElementsAttr::get(shape_type, result_shape_data); auto shape = - rewriter.create(loc, shape_type, shape_attr); + arith::ConstantOp::create(rewriter, loc, shape_type, shape_attr); auto revised_output_type = tensorflow::GetTypeFromTFTensorShape( revised_shape, original_input_type.getElementType()); - TF::ReshapeOp reshape = rewriter.create( - loc, revised_output_type, original_input, shape); + TF::ReshapeOp reshape = TF::ReshapeOp::create( + rewriter, loc, revised_output_type, original_input, shape); // Replace the original strided_slice. 
uint64_t revised_begin_mask = strided_slice_op.getBeginMask(); @@ -656,13 +656,13 @@ struct ConvertTFStridedSlice : public RewritePattern { auto begin_attr = DenseElementsAttr::get(type, padded_begin); auto begin_op = - rewriter.create(op->getLoc(), type, begin_attr); + arith::ConstantOp::create(rewriter, op->getLoc(), type, begin_attr); auto end_attr = DenseElementsAttr::get(type, padded_end); auto end_op = - rewriter.create(op->getLoc(), type, end_attr); + arith::ConstantOp::create(rewriter, op->getLoc(), type, end_attr); auto stride_attr = DenseElementsAttr::get(type, padded_stride); auto stride_op = - rewriter.create(op->getLoc(), type, stride_attr); + arith::ConstantOp::create(rewriter, op->getLoc(), type, stride_attr); rewriter.replaceOpWithNewOp( op, strided_slice_op.getType(), input, begin_op.getResult(), @@ -767,17 +767,17 @@ struct ConvertTFStridedSlice : public RewritePattern { auto begin_end_type = tensorflow::GetTypeFromTFTensorShape( {num_input_dims}, rewriter.getIntegerType(32)); - auto new_begin_attr = rewriter.create( - op->getLoc(), begin_end_type, + auto new_begin_attr = arith::ConstantOp::create( + rewriter, op->getLoc(), begin_end_type, DenseElementsAttr::get(begin_end_type, padded_begin)); - auto new_end_attr = rewriter.create( - op->getLoc(), begin_end_type, + auto new_end_attr = arith::ConstantOp::create( + rewriter, op->getLoc(), begin_end_type, DenseElementsAttr::get(begin_end_type, padded_end)); auto strides_type = tensorflow::GetTypeFromTFTensorShape( {static_cast(padded_strides.size())}, rewriter.getIntegerType(32)); - auto new_strides_attr = rewriter.create( - op->getLoc(), strides_type, + auto new_strides_attr = arith::ConstantOp::create( + rewriter, op->getLoc(), strides_type, DenseElementsAttr::get(strides_type, padded_strides)); auto attribute_type = rewriter.getIntegerType(64); @@ -1043,9 +1043,10 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern { auto reduce_dim_type = tensorflow::GetTypeFromTFTensorShape( {3}, rewriter.getIntegerType(32)); ::mlir::SmallVector reduce_dim_values = {0, 1, 2}; - reduce_dim_op = rewriter.create( - odsLoc, ::mlir::DenseIntElementsAttr::get(reduce_dim_type, - reduce_dim_values)); + reduce_dim_op = + TF::ConstOp::create(rewriter, odsLoc, + ::mlir::DenseIntElementsAttr::get( + reduce_dim_type, reduce_dim_values)); } auto new_mean_type = tensorflow::GetTypeFromTFTensorShape( @@ -1053,8 +1054,8 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern { ::mlir::TF::MeanOp mean_op_1; { ::mlir::Value x_value = (*x.begin()); - mean_op_1 = rewriter.create( - odsLoc, new_mean_type, x_value, reduce_dim_op, + mean_op_1 = TF::MeanOp::create( + rewriter, odsLoc, new_mean_type, x_value, reduce_dim_op, /*keep_dims=*/rewriter.getBoolAttr(false)); } @@ -1064,15 +1065,15 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern { ::mlir::Value tblgen_value_1 = (*mean_op_1.getODSResults(0).begin()); // If x has shape of [b, h, w, c], the result of mean_op_1 will have // shape of [c]. Therefore, their shapes are always compatible. 
- square_diff_op = rewriter.create<::mlir::TF::SquaredDifferenceOp>( - odsLoc, tblgen_value_0, tblgen_value_1); + square_diff_op = ::mlir::TF::SquaredDifferenceOp::create( + rewriter, odsLoc, tblgen_value_0, tblgen_value_1); } ::mlir::TF::MeanOp mean_op_2; { ::mlir::Value input_value = (*square_diff_op.getODSResults(0).begin()); - mean_op_2 = rewriter.create( - odsLoc, new_mean_type, input_value, reduce_dim_op, + mean_op_2 = TF::MeanOp::create( + rewriter, odsLoc, new_mean_type, input_value, reduce_dim_op, /*keep_dims=*/rewriter.getBoolAttr(false)); } @@ -1083,57 +1084,56 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern { ::llvm::SmallVector<::mlir::Value, 4> replace_values; ::mlir::TF::ConstOp epsilon_const_op; { - epsilon_const_op = - rewriter.create<::mlir::TF::ConstOp>(odsLoc, - /*value=*/epsilon); + epsilon_const_op = ::mlir::TF::ConstOp::create(rewriter, odsLoc, + /*value=*/epsilon); } ::mlir::TF::AddOp add_op_1; { ::mlir::Value epsilon_value = (*epsilon_const_op.getODSResults(0).begin()); // Multiplying with a constant, no need to check broadcastibility. - add_op_1 = rewriter.create<::mlir::TF::AddOp>(odsLoc, - /*x=*/variance_value, - /*y=*/epsilon_value); + add_op_1 = ::mlir::TF::AddOp::create(rewriter, odsLoc, + /*x=*/variance_value, + /*y=*/epsilon_value); } ::mlir::TF::RsqrtOp rsqrt_op; { ::mlir::SmallVector<::mlir::Value, 4> tblgen_values; ::mlir::SmallVector<::mlir::NamedAttribute, 4> tblgen_attrs; tblgen_values.push_back((*add_op_1.getODSResults(0).begin())); - rsqrt_op = rewriter.create<::mlir::TF::RsqrtOp>(odsLoc, tblgen_values, - tblgen_attrs); + rsqrt_op = ::mlir::TF::RsqrtOp::create(rewriter, odsLoc, tblgen_values, + tblgen_attrs); } ::mlir::TF::MulOp multiplier; { ::mlir::Value tblgen_value_0 = (*scale.begin()); ::mlir::Value tblgen_value_1 = (*rsqrt_op.getODSResults(0).begin()); - multiplier = rewriter.create<::mlir::TF::MulOp>(odsLoc, - /*x=*/tblgen_value_0, - /*y=*/tblgen_value_1); + multiplier = ::mlir::TF::MulOp::create(rewriter, odsLoc, + /*x=*/tblgen_value_0, + /*y=*/tblgen_value_1); } ::mlir::TF::MulOp mul_op_1; { ::mlir::Value tblgen_value_0 = (*x.begin()); ::mlir::Value tblgen_value_1 = (*multiplier.getODSResults(0).begin()); - mul_op_1 = rewriter.create<::mlir::TF::MulOp>(odsLoc, - /*x=*/tblgen_value_0, - /*y=*/tblgen_value_1); + mul_op_1 = ::mlir::TF::MulOp::create(rewriter, odsLoc, + /*x=*/tblgen_value_0, + /*y=*/tblgen_value_1); } ::mlir::TF::MulOp mul_op_2; { ::mlir::Value multiplier_value = (*multiplier.getODSResults(0).begin()); - mul_op_2 = rewriter.create<::mlir::TF::MulOp>(odsLoc, - /*x=*/mean_value, - /*y=*/multiplier_value); + mul_op_2 = ::mlir::TF::MulOp::create(rewriter, odsLoc, + /*x=*/mean_value, + /*y=*/multiplier_value); } ::mlir::TF::SubOp sub_op; { ::mlir::Value tblgen_value_0 = (*offset.begin()); ::mlir::Value tblgen_value_1 = (*mul_op_2.getODSResults(0).begin()); - sub_op = rewriter.create<::mlir::TF::SubOp>(odsLoc, - /*x=*/tblgen_value_0, - /*y=*/tblgen_value_1); + sub_op = ::mlir::TF::SubOp::create(rewriter, odsLoc, + /*x=*/tblgen_value_0, + /*y=*/tblgen_value_1); } ::mlir::TF::AddOp add_op_2; { @@ -1145,8 +1145,8 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern { for (auto v : fused_batch_norm_op.getODSResults(0)) { tblgen_types.push_back(v.getType()); } - add_op_2 = rewriter.create<::mlir::TF::AddOp>( - odsLoc, tblgen_types, tblgen_values, tblgen_attrs); + add_op_2 = ::mlir::TF::AddOp::create(rewriter, odsLoc, tblgen_types, + tblgen_values, tblgen_attrs); } for (auto v : ::llvm::SmallVector<::mlir::Value, 
4>{add_op_2.getODSResults(0)}) { @@ -1261,9 +1261,9 @@ struct ReorderFakeQuantPattern : public RewritePattern { ReorderOp &new_reorder_op) const { Value tensor_value = (*input.begin()); Value shape_value = (*shape.begin()); - new_reorder_op = rewriter.create(ods_loc, - /*tensor=*/tensor_value, - /*shape=*/shape_value); + new_reorder_op = ReorderOp::create(rewriter, ods_loc, + /*tensor=*/tensor_value, + /*shape=*/shape_value); return success(); } @@ -1289,8 +1289,8 @@ struct ReorderFakeQuantPattern : public RewritePattern { for (auto v : casted_op.getODSResults(0)) { target_types.push_back(v.getType()); } - fakequant_op = rewriter.create( - ods_loc, target_types, target_values, target_attrs); + fakequant_op = TF::FakeQuantWithMinMaxVarsOp::create( + rewriter, ods_loc, target_types, target_values, target_attrs); return success(); } @@ -1442,13 +1442,14 @@ struct ConvertRfftToRfft2d : public RewritePattern { auto expaned_input_type = tensorflow::GetTypeFromTFTensorShape( expanded_input_shape, input_type.getElementType()); - TF::ExpandDimsOp expanded_input = rewriter.create( - rfft_op.getLoc(), expaned_input_type, input, minus_two->getResult()); + TF::ExpandDimsOp expanded_input = + TF::ExpandDimsOp::create(rewriter, rfft_op.getLoc(), expaned_input_type, + input, minus_two->getResult()); // Expanded fft_len. auto one_attr = mlir::DenseIntElementsAttr::get(one_ele_type, {1}); - auto one = rewriter.create(rfft_op.getLoc(), one_attr); + auto one = TF::ConstOp::create(rewriter, rfft_op.getLoc(), one_attr); auto zero = CreateConstOpWithSingleValue(&rewriter, rfft_op.getLoc(), one_ele_type, 0); @@ -1456,21 +1457,22 @@ struct ConvertRfftToRfft2d : public RewritePattern { auto expanded_fft_len_type = tensorflow::GetTypeFromTFTensorShape( {2}, fft_len_type.getElementType()); - TF::ConcatV2Op expanded_fft_len = rewriter.create( - rfft_op.getLoc(), expanded_fft_len_type, + TF::ConcatV2Op expanded_fft_len = TF::ConcatV2Op::create( + rewriter, rfft_op.getLoc(), expanded_fft_len_type, SmallVector({one.getResult(), fft_len}), zero->getResult()); // Insert the rfft_2d. auto rfft2d_out_type = tensorflow::GetTypeFromTFTensorShape( expanded_output_shape, output_type.getElementType()); - TF::RFFT2DOp rfft2d = rewriter.create( - rfft_op.getLoc(), rfft2d_out_type, expanded_input.getResult(), + TF::RFFT2DOp rfft2d = TF::RFFT2DOp::create( + rewriter, rfft_op.getLoc(), rfft2d_out_type, expanded_input.getResult(), expanded_fft_len.getResult()); // Insert the squeeze op. 
auto squeeze_dim = rewriter.getI64ArrayAttr({-2}); - TF::SqueezeOp squeeze = rewriter.create( - rfft_op.getLoc(), output_type, rfft2d.getResult(), squeeze_dim); + TF::SqueezeOp squeeze = + TF::SqueezeOp::create(rewriter, rfft_op.getLoc(), output_type, + rfft2d.getResult(), squeeze_dim); rewriter.replaceOp(op, squeeze.getResult()); @@ -1614,8 +1616,8 @@ class QuantizeConcatResult : public OpRewritePattern { llvm::SmallVector inputs{concat_result, min_v, max_v}; rewriter.setInsertionPointAfter(concat.getOperation()); - auto new_fake_quant_op = rewriter.create( - concat.getLoc(), concat->getResultTypes(), inputs, + auto new_fake_quant_op = TF::FakeQuantWithMinMaxVarsOp::create( + rewriter, concat.getLoc(), concat->getResultTypes(), inputs, (*fake_quant_ops.begin())->getAttrs()); for (OpOperand *use : uses) { @@ -1673,8 +1675,9 @@ class QuantizeMeanResult : public OpRewritePattern { llvm::SmallVector inputs{mean_result, fq.getMin(), fq.getMax()}; rewriter.setInsertionPointAfter(mean.getOperation()); - auto new_fake_quant_op = rewriter.create( - mean.getLoc(), mean->getResultTypes(), inputs, fq->getAttrs()); + auto new_fake_quant_op = TF::FakeQuantWithMinMaxVarsOp::create( + rewriter, mean.getLoc(), mean->getResultTypes(), inputs, + fq->getAttrs()); for (OpOperand *use : uses) { use->assign(new_fake_quant_op); diff --git a/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.cc b/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.cc index 4dc0b4bf668043..82803f6de927cb 100644 --- a/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.cc @@ -173,8 +173,8 @@ class CommuteBothInputsTransposedWithEwiseOps : public RewritePattern { new_out_type, op->getAttrs()); // Apply original tranpose to output of ewise op. - auto out_tpose_op = rewriter.create( - new_ewise_op->getLoc(), op->getResult(0).getType(), + auto out_tpose_op = TFL::TransposeOp::create( + rewriter, new_ewise_op->getLoc(), op->getResult(0).getType(), new_ewise_op->getResults()[0], perm1); rewriter.replaceOp(op, out_tpose_op.getOperation()); return success(); @@ -273,7 +273,7 @@ class CommuteTransposeWithEwiseOps : public RewritePattern { RankedTensorType::get(inverse_perm.size(), rewriter.getI32Type()), inverse_perm); auto inverse_perm_op = - rewriter.create(perm.getLoc(), inverse_perm_attr); + arith::ConstantOp::create(rewriter, perm.getLoc(), inverse_perm_attr); // Transpose the input constant. auto in_rtt = @@ -283,9 +283,9 @@ class CommuteTransposeWithEwiseOps : public RewritePattern { RankedTensorType::get(PermuteShape(in_rtt.getShape(), inverse_perm), in_rtt.getElementType()); - tposed_const = rewriter.create( - cst_arg->getLoc(), inverse_type, cst_arg->getResult(0), - inverse_perm_op); + tposed_const = + TFL::TransposeOp::create(rewriter, cst_arg->getLoc(), inverse_type, + cst_arg->getResult(0), inverse_perm_op); } auto current_out_type = @@ -301,8 +301,8 @@ class CommuteTransposeWithEwiseOps : public RewritePattern { new_out_type, op->getAttrs()); // Apply original tranpose to output of ewise op. 
- auto out_tpose_op = rewriter.create( - new_ewise_op->getLoc(), op->getResult(0).getType(), + auto out_tpose_op = TFL::TransposeOp::create( + rewriter, new_ewise_op->getLoc(), op->getResult(0).getType(), new_ewise_op->getResults()[0], perm); rewriter.replaceOp(op, out_tpose_op.getOperation()); return success(); diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc index d0c143d73914c9..c50e0a26e71c48 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc @@ -80,13 +80,13 @@ static LogicalResult IsDrqTensor(Value value, Value& fq_input) { // fake quant op. // This is to support the case such as: // %2077 = "vhlo.composite_v1"(%73, %69, %2070) : (tensor, tensor, - // tensor<1x?x512xf32>) -> tensor<1x?x512xf32> + // tensor<1x?x512xf32>) -> tensor<1x?x512xf32> // %2078 = "tfl.reshape"(%2077, %99) : (tensor<1x?x512xf32>, tensor<2xi32>) -> - // tensor + // tensor // %2079 = "tfl.pseudo_qconst"() <{qtype = tensor<64x512x!quant.uniform, tensor<64x512xf32>, none) -> - // tensor + // %2080 = "tfl.dequantize"(%2079) + // %2081 = "tfl.fully_connected"(%2078, %2080, %0) : (tensor, + // tensor<64x512xf32>, none) -> tensor // TODO - b/422588785: Have proper support for dynamic shaped models. auto v = value; if (auto reshape_op = llvm::dyn_cast_or_null(v.getDefiningOp())) { @@ -207,8 +207,9 @@ class PushForwardDrqFQ : public OpRewritePattern { Value float_input = drq_fq_op.getOperand(drq_fq_op.getNumOperands() - 1); // Create a new pad op. - auto new_pad_op = rewriter.create( - pad_op.getLoc(), pad_op.getType(), float_input, pad_op.getPadding()); + auto new_pad_op = + TFL::PadOp::create(rewriter, pad_op.getLoc(), pad_op.getType(), + float_input, pad_op.getPadding()); // Create a new drq fake quant op. // Operands are the same, except for the last one. @@ -218,8 +219,8 @@ class PushForwardDrqFQ : public OpRewritePattern { } new_drq_operands.push_back(new_pad_op.getResult()); - auto new_drq_fq_op = rewriter.create( - drq_fq_op.getLoc(), pad_op.getType(), new_drq_operands, + auto new_drq_fq_op = stablehlo::CompositeOp::create( + rewriter, drq_fq_op.getLoc(), pad_op.getType(), new_drq_operands, drq_fq_op->getAttrs()); rewriter.replaceOp(pad_op, new_drq_fq_op.getResult(0)); @@ -227,6 +228,40 @@ class PushForwardDrqFQ : public OpRewritePattern { } }; +// Fixes keep_num_dims option of FC if output dims is different from input dims +// though keep_num_dims is true. It happens when FC's input has changed after +// quantization, e.g. by IsDrqTensor(). +// Sets keep_num_dims to false if that's the case. Otherwise, it's not +// compatible with GPU. See CheckGpuDelegateCompatibility() in +// third_party/tensorflow/lite/tools/versioning/gpu_compatibility.cc. +// Note that if FC is followed by Reshape, the keep_num_dims will be set to true +// with a correct shape later by EnableFullyConnectedKeepNumDimsBeforeReshape() +// in optimize pass. 
+struct FixFullyConnectedKeepNumDims + : public OpRewritePattern { + explicit FixFullyConnectedKeepNumDims(MLIRContext* context) + : OpRewritePattern(context, /*benefit=*/0) {} + + LogicalResult matchAndRewrite(FullyConnectedOp fc, + PatternRewriter& rewriter) const override { + if (!fc.getKeepNumDims()) return failure(); + + auto input_ty = + mlir::dyn_cast_or_null(fc.getInput().getType()); + auto fc_ty = mlir::dyn_cast_or_null(fc.getType(0)); + if (!input_ty || !fc_ty) return failure(); + + auto input_shape = input_ty.getShape(); + auto fc_shape = fc_ty.getShape(); + if (input_shape.size() == fc_shape.size()) { + return failure(); + } + + fc.setKeepNumDims(false); + return success(); + } +}; + class StrictQuantizationPattern : public RewritePattern { public: using BaseType = StrictQuantizationPattern; @@ -763,7 +798,7 @@ void QuantizePass::runOnOperation() { patterns.add(ctx, quant_params); } - + patterns.add(ctx); (void)applyPatternsGreedily(func, std::move(patterns)); // Constant quantization is a lossy transformation, so they are applied only diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc index 7453ed54975a5a..d6e18dc4158508 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc @@ -114,11 +114,12 @@ void QuantizeVariablesPass::QuantizeVariable( // Add dequantize. builder.setInsertionPointAfter(read_variable_op); auto new_read_variable_op = - builder.create(read_variable_op.getLoc(), ref_qtype, - read_variable_op.getResourceId()); - auto new_dq_op = builder.create( - read_variable_op.getLoc(), read_variable_op.getResult().getType(), - new_read_variable_op.getResult()); + ReadVariableOp::create(builder, read_variable_op.getLoc(), ref_qtype, + read_variable_op.getResourceId()); + auto new_dq_op = + DequantizeOp::create(builder, read_variable_op.getLoc(), + read_variable_op.getResult().getType(), + new_read_variable_op.getResult()); read_variable_op->replaceAllUsesWith(new_dq_op); read_variable_op.erase(); } @@ -135,19 +136,19 @@ void QuantizeVariablesPass::QuantizeVariable( if (qtype == quant::QuantizedType::getQuantizedElementType(ref_qtype)) { // Same quantization parameters, remove it. builder.setInsertionPoint(assign_variable_op); - auto new_assign_variable_op = builder.create( - assign_variable_op.getLoc(), assign_variable_op.getResourceId(), - dq_op.getInput()); + auto new_assign_variable_op = AssignVariableOp::create( + builder, assign_variable_op.getLoc(), + assign_variable_op.getResourceId(), dq_op.getInput()); assign_variable_op->replaceAllUsesWith(new_assign_variable_op); } else { // Otherwise, apply re-quantization. builder.setInsertionPoint(assign_variable_op); - auto new_q_op = builder.create( - assign_variable_op.getLoc(), ref_qtype, dq_op.getInput(), + auto new_q_op = QuantizeOp::create( + builder, assign_variable_op.getLoc(), ref_qtype, dq_op.getInput(), TypeAttr::get(ref_qtype)); - auto new_assign_variable_op = builder.create( - assign_variable_op.getLoc(), assign_variable_op.getResourceId(), - new_q_op.getResult()); + auto new_assign_variable_op = AssignVariableOp::create( + builder, assign_variable_op.getLoc(), + assign_variable_op.getResourceId(), new_q_op.getResult()); assign_variable_op->replaceAllUsesWith(new_assign_variable_op); } assign_variable_op.erase(); @@ -155,12 +156,12 @@ void QuantizeVariablesPass::QuantizeVariable( } else { // Add quantize op. 
builder.setInsertionPoint(assign_variable_op); - auto new_q_op = builder.create( - assign_variable_op.getLoc(), ref_qtype, + auto new_q_op = QuantizeOp::create( + builder, assign_variable_op.getLoc(), ref_qtype, assign_variable_op.getValue(), TypeAttr::get(ref_qtype)); - auto new_assign_variable_op = builder.create( - assign_variable_op.getLoc(), assign_variable_op.getResourceId(), - new_q_op.getResult()); + auto new_assign_variable_op = AssignVariableOp::create( + builder, assign_variable_op.getLoc(), + assign_variable_op.getResourceId(), new_q_op.getResult()); assign_variable_op->replaceAllUsesWith(new_assign_variable_op); assign_variable_op.erase(); } @@ -171,9 +172,9 @@ void QuantizeVariablesPass::QuantizeVariable( builder.setInsertionPoint(var_handle_op); auto output_type = UnrankedTensorType::get(TF::ResourceType::get( {mlir::cast(ref_qtype)}, builder.getContext())); - auto new_var_handle_op = builder.create( - var_handle_op.getLoc(), output_type, var_handle_op.getContainer(), - var_handle_op.getSharedName()); + auto new_var_handle_op = VarHandleOp::create( + builder, var_handle_op.getLoc(), output_type, + var_handle_op.getContainer(), var_handle_op.getSharedName()); var_handle_op->replaceAllUsesWith(new_var_handle_op); var_handle_op.erase(); } diff --git a/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc index 80e0986209e8d0..58fff203b9fb3e 100644 --- a/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc @@ -80,8 +80,8 @@ void RaiseCustomOpsPass::runOnOperation() { for (auto *op : custom_ops) { builder.setInsertionPoint(op); Location loc = op->getLoc(); - auto custom_op = builder.create(loc, op->getResultTypes(), - op->getOperands()); + auto custom_op = CustomTfOp::create(builder, loc, op->getResultTypes(), + op->getOperands()); Region region; Block *new_block = new Block; region.push_back(new_block); @@ -95,7 +95,7 @@ void RaiseCustomOpsPass::runOnOperation() { inner_op->setOperand(idx_args.index(), idx_args.value()); } custom_op->setAttrs(inner_op->getAttrs()); - builder.create(loc, inner_op->getResults()); + YieldOp::create(builder, loc, inner_op->getResults()); custom_op.getBody().takeBody(region); op->replaceAllUsesWith(custom_op); diff --git a/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc b/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc index e964f76b30efbe..a0a6df9cf4feef 100644 --- a/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc +++ b/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc @@ -106,12 +106,12 @@ class SanitizeGatherOpOutputToI4 : public OpRewritePattern { } Builder builder(op.getContext()); - auto new_gather_op = rewriter.create( - op.getLoc(), - /*result=*/ - mlir::cast(op.getResult().getType()) - .clone(builder.getI4Type()), - /*operand=*/op.getOperands(), op->getAttrs()); + auto new_gather_op = + TFL::GatherOp::create(rewriter, op.getLoc(), + /*result=*/ + mlir::cast(op.getResult().getType()) + .clone(builder.getI4Type()), + /*operand=*/op.getOperands(), op->getAttrs()); rewriter.replaceAllUsesWith(op.getResult(), new_gather_op.getResult()); return success(); diff --git a/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc index 2b03557121652f..6f476ded0a1a62 100644 --- 
a/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc @@ -60,23 +60,21 @@ void MaybeUnfoldLargeSplatConstant(mlir::OpBuilder* op_builder, } op_builder->setInsertionPoint(const_op); - mlir::arith::ConstantOp fill_shape = - op_builder->create( - const_op->getLoc(), DenseIntElementsAttr::get( - tensorflow::GetTypeFromTFTensorShape( - {splat_elements_attr.getType().getRank()}, - op_builder->getI64Type()), - splat_elements_attr.getType().getShape())); - mlir::arith::ConstantOp fill_value = - op_builder->create( - const_op->getLoc(), - DenseElementsAttr::get( - tensorflow::GetTypeFromTFTensorShape( - {}, splat_elements_attr.getType().getElementType()), - splat_elements_attr.getSplatValue())); - TFL::FillOp fill = op_builder->create( - const_op->getLoc(), splat_elements_attr.getType(), fill_shape, - fill_value); + mlir::arith::ConstantOp fill_shape = mlir::arith::ConstantOp::create( + *op_builder, const_op->getLoc(), + DenseIntElementsAttr::get(tensorflow::GetTypeFromTFTensorShape( + {splat_elements_attr.getType().getRank()}, + op_builder->getI64Type()), + splat_elements_attr.getType().getShape())); + mlir::arith::ConstantOp fill_value = mlir::arith::ConstantOp::create( + *op_builder, const_op->getLoc(), + DenseElementsAttr::get( + tensorflow::GetTypeFromTFTensorShape( + {}, splat_elements_attr.getType().getElementType()), + splat_elements_attr.getSplatValue())); + TFL::FillOp fill = TFL::FillOp::create(*op_builder, const_op->getLoc(), + splat_elements_attr.getType(), + fill_shape, fill_value); const_op->replaceAllUsesWith(fill); const_op->erase(); } diff --git a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h index 4e0fb068c8b9e8..d9cab52085ef5b 100644 --- a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h @@ -150,10 +150,10 @@ class InsertTFLQuantOpsAfterTFFakeQuantOp { // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp // and its users. 
Value value = tf_op.getOutputs(); - auto quantize = rewriter.create( - tf_op.getLoc(), qtype.getValue(), value, qtype); - auto dequantize = rewriter.create( - tf_op.getLoc(), res_type, quantize.getOutput()); + auto quantize = TFL::QuantizeOp::create(rewriter, tf_op.getLoc(), + qtype.getValue(), value, qtype); + auto dequantize = TFL::DequantizeOp::create(rewriter, tf_op.getLoc(), + res_type, quantize.getOutput()); value.replaceAllUsesWith(dequantize); quantize.getOperation()->replaceUsesOfWith(dequantize, value); diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc index a2023742140fce..f94cad6b5eabe7 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc @@ -52,14 +52,14 @@ Value CreateI32SplatConst(OpBuilder* builder, ArrayRef shape, int32_t val, mlir::Location location) { auto type = RankedTensorType::get(shape, builder->getIntegerType(32)); auto attr = DenseElementsAttr::get(type, val); - return builder->create(location, type, attr); + return arith::ConstantOp::create(*builder, location, type, attr); } Value CreateF32SplatConst(OpBuilder* builder, ArrayRef shape, float val, mlir::Location location) { auto type = RankedTensorType::get(shape, builder->getF32Type()); auto attr = DenseElementsAttr::get(type, val); - return builder->create(location, type, attr); + return arith::ConstantOp::create(*builder, location, type, attr); } Value CreatTfF32ConstOp(OpBuilder* builder, ArrayRef shape, float val, @@ -67,7 +67,7 @@ Value CreatTfF32ConstOp(OpBuilder* builder, ArrayRef shape, float val, auto type = RankedTensorType::get(shape, builder->getF32Type()); auto ele_type = RankedTensorType::get({1}, builder->getF32Type()); auto attr = DenseElementsAttr::get(ele_type, val); - return builder->create(location, type, attr); + return TF::ConstOp::create(*builder, location, type, attr); } Value CreateI64DenseConst(OpBuilder* builder, ArrayRef shape, @@ -75,7 +75,7 @@ Value CreateI64DenseConst(OpBuilder* builder, ArrayRef shape, auto type = RankedTensorType::get(static_cast(shape.size()), builder->getIntegerType(64)); auto attr = DenseElementsAttr::get(type, values); - return builder->create(location, type, attr); + return arith::ConstantOp::create(*builder, location, type, attr); } Value CreateI32DenseConst(OpBuilder* builder, ArrayRef values, @@ -83,12 +83,12 @@ Value CreateI32DenseConst(OpBuilder* builder, ArrayRef values, auto type = RankedTensorType::get(static_cast(values.size()), builder->getIntegerType(32)); auto attr = DenseElementsAttr::get(type, values); - return builder->create(location, type, attr); + return arith::ConstantOp::create(*builder, location, type, attr); } Value CreateNoneValue(OpBuilder* builder, mlir::Location location) { - return builder->create(location, builder->getNoneType(), - builder->getUnitAttr()); + return TFL::NoValueOp::create(*builder, location, builder->getNoneType(), + builder->getUnitAttr()); } Value Transpose(OpBuilder* builder, Value value_to_transpose, @@ -106,8 +106,8 @@ Value Transpose(OpBuilder* builder, Value value_to_transpose, auto elem_type = transpose_type.getElementType(); auto result_type = RankedTensorType::get(transpose_shape, elem_type); - return builder->create(location, result_type, - value_to_transpose, perm_op); + return TF::TransposeOp::create(*builder, location, result_type, + value_to_transpose, perm_op); } Value Transpose2D(OpBuilder* builder, Value value_to_transpose, @@ -121,8 +121,8 @@ Value Reverse(OpBuilder* 
builder, Value value_to_reverse, int axis, RankedTensorType type, mlir::Location location) { auto axis_op = CreateI32SplatConst(builder, {1}, axis, location); // The result type will be the same as the input. - return builder->create(location, type, value_to_reverse, - axis_op); + return TF::ReverseV2Op::create(*builder, location, type, value_to_reverse, + axis_op); } ArrayRef GetRankedTensorShape(Value value) { @@ -154,8 +154,8 @@ Value SliceRankedTensor(OpBuilder* builder, Value input, auto slice_i2c_size = CreateI64DenseConst(builder, size_shape, size_values, location); - return builder->create( - location, + return TF::SliceOp::create( + *builder, location, RankedTensorType::get( size_values, mlir::cast(input.getType()).getElementType()), @@ -175,9 +175,9 @@ Value CreateStridedSliceOp(mlir::Location loc, ArrayRef output_shape, auto end_tensor = CreateI32DenseConst(builder, end, loc); auto strides_tensor = CreateI32DenseConst(builder, strides, loc); - return builder->create( - loc, output_type, input, begin_tensor, end_tensor, strides_tensor, - builder->getI64IntegerAttr(begin_mask), + return TF::StridedSliceOp::create( + *builder, loc, output_type, input, begin_tensor, end_tensor, + strides_tensor, builder->getI64IntegerAttr(begin_mask), builder->getI64IntegerAttr(end_mask), builder->getI64IntegerAttr(ellipsis_mask), builder->getI64IntegerAttr(new_axis_mask), @@ -590,21 +590,20 @@ TF::ConstOp Create1DConstantOp(const std::vector& value, Location loc, auto type = mlir::RankedTensorType::get(value.size(), builder->getIntegerType(32)); auto dense_values = mlir::DenseIntElementsAttr::get(type, value); - return builder->create(loc, dense_values); + return TF::ConstOp::create(*builder, loc, dense_values); } TF::ConstOp CreateScalarConstantOp(int value, Location loc, OpBuilder* builder) { - return builder->create(loc, builder->getI32IntegerAttr(value)); + return TF::ConstOp::create(*builder, loc, builder->getI32IntegerAttr(value)); } TF::ReshapeOp CreateFlattenOP(const Value& input, Location loc, OpBuilder* builder) { auto output_shape = Create1DConstantOp({-1}, loc, builder); - return builder->create( - loc, - /*tensor=*/input, - /*shape=*/output_shape.getResult()); + return mlir::TF::ReshapeOp::create(*builder, loc, + /*tensor=*/input, + /*shape=*/output_shape.getResult()); } LogicalResult CreateEqualSizeSplitVOp(Value input, int axis, int splits, @@ -637,9 +636,9 @@ LogicalResult CreateEqualSizeSplitVOp(Value input, int axis, int splits, builder); auto axis_op = CreateScalarConstantOp(axis, loc, builder); - *result = builder->create(loc, output_types, input, - size_of_splits_op.getResult(), - axis_op.getResult()); + *result = + TF::SplitVOp::create(*builder, loc, output_types, input, + size_of_splits_op.getResult(), axis_op.getResult()); return success(); } @@ -771,8 +770,8 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, mlir::cast(final_inputs.getType()).getElementType()); Value none = CreateNoneValue(builder, func_op.getLoc()); - auto lstm = builder->create( - func_op.getLoc(), result_type, /*input=*/final_inputs, + auto lstm = mlir::TFL::UnidirectionalSequenceLSTMOp::create( + *builder, func_op.getLoc(), result_type, /*input=*/final_inputs, /*input_to_input_weights=*/weights_array->getResult(0), /*input_to_forget_weights=*/weights_array->getResult(1), /*input_to_cell_weights=*/weights_array->getResult(2), @@ -881,7 +880,7 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, func_op.getFunctionType().getInputs(), output_types)); - 
builder->create(func_op.getLoc(), outputs); + mlir::func::ReturnOp::create(*builder, func_op.getLoc(), outputs); return success(); } diff --git a/tensorflow/compiler/mlir/lite/utils/nms_utils.cc b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc index 211336de124075..59c3f883411221 100644 --- a/tensorflow/compiler/mlir/lite/utils/nms_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc @@ -57,11 +57,11 @@ void ConvertNMSPaddedFunc::RewriteFunc() { auto output_type1 = func_.getFunctionType().getResult(1); OpBuilder builder(func_.getBody()); - auto op = builder.create( - func_.getLoc(), output_type0, output_type1, boxes, scores, + auto op = mlir::TFL::NonMaxSuppressionV4Op::create( + builder, func_.getLoc(), output_type0, output_type1, boxes, scores, max_output_size, iou_threshold, score_threshold); - builder.create(func_.getLoc(), op.getResults()); + mlir::func::ReturnOp::create(builder, func_.getLoc(), op.getResults()); } LogicalResult ConvertNMSPaddedFunc::VerifySignature() { @@ -102,11 +102,11 @@ LogicalResult ConvertSSDPostProcessFunc::RewriteFunc() { custom_option_buffer))) { return failure(); } - auto op = builder.create( - func_.getLoc(), func_.getFunctionType().getResults(), - func_.getArguments(), kCustomSSDPostprocessing, - CustomOption(&builder, custom_option_buffer)); - builder.create(func_.getLoc(), op.getResults()); + auto op = CustomOp::create(builder, func_.getLoc(), + func_.getFunctionType().getResults(), + func_.getArguments(), kCustomSSDPostprocessing, + CustomOption(&builder, custom_option_buffer)); + func::ReturnOp::create(builder, func_.getLoc(), op.getResults()); return success(); } diff --git a/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc b/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc index 5f680c7db9be58..4bcf4b86e0ea17 100644 --- a/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc @@ -104,11 +104,11 @@ LogicalResult ConvertMaxUnpoolingFunc::RewriteFunc() { if (failed(CreateCustomOptions(custom_option_buffer))) { return failure(); } - auto op = builder.create( - func_.getLoc(), func_.getFunctionType().getResults(), - func_.getArguments(), kMaxUnpooling, - CustomOption(&builder, custom_option_buffer)); - builder.create(func_.getLoc(), op.getResults()); + auto op = CustomOp::create(builder, func_.getLoc(), + func_.getFunctionType().getResults(), + func_.getArguments(), kMaxUnpooling, + CustomOption(&builder, custom_option_buffer)); + func::ReturnOp::create(builder, func_.getLoc(), op.getResults()); return success(); } @@ -205,11 +205,11 @@ LogicalResult ConvertDenseImageWarpFunc::RewriteFunc() { StringAttr::get(func_.getContext(), kImageWarping)); OpBuilder builder(func_.getBody()); - auto op = builder.create(func_.getLoc(), - func_.getFunctionType().getResults(), - func_.getArguments(), kImageWarping, - CustomOption(&builder, /*content=*/"")); - builder.create(func_.getLoc(), op.getResults()); + auto op = CustomOp::create(builder, func_.getLoc(), + func_.getFunctionType().getResults(), + func_.getArguments(), kImageWarping, + CustomOption(&builder, /*content=*/"")); + func::ReturnOp::create(builder, func_.getLoc(), op.getResults()); return success(); } diff --git a/tensorflow/compiler/mlir/lite/utils/region_isolation_test.cc b/tensorflow/compiler/mlir/lite/utils/region_isolation_test.cc index 3c136be98ef071..f3917e32d91126 100644 --- a/tensorflow/compiler/mlir/lite/utils/region_isolation_test.cc +++ 
b/tensorflow/compiler/mlir/lite/utils/region_isolation_test.cc @@ -85,7 +85,7 @@ TEST(RegionIsolationTest, CaseOp) { OpBuilder b(&ctx); - OwningOpRef root(b.create(b.getUnknownLoc())); + OwningOpRef root(ModuleOp::create(b, b.getUnknownLoc())); { auto& block = root->getBodyRegion().front(); diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc index fa191c6c69d984..a402deb4bc230e 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc @@ -136,10 +136,10 @@ LogicalResult ConvertWhitespaceTokenizer(func::FuncOp func, llvm::StringRef api, func->setAttr(kTFImplements, attr); OpBuilder builder(func.getBody()); std::string empty_option_buffer; - auto op = builder.create( - func.getLoc(), func.getFunctionType().getResults(), func.getArguments(), - api, CustomOption(&builder, empty_option_buffer)); - builder.create(func.getLoc(), op.getResults()); + auto op = CustomOp::create( + builder, func.getLoc(), func.getFunctionType().getResults(), + func.getArguments(), api, CustomOption(&builder, empty_option_buffer)); + func::ReturnOp::create(builder, func.getLoc(), op.getResults()); return success(); } @@ -267,10 +267,10 @@ LogicalResult ConvertNgrams(func::FuncOp func, llvm::StringRef api, custom_option_buffer))) { return failure(); } - auto op = builder.create( - func.getLoc(), func.getFunctionType().getResults(), func.getArguments(), - api, CustomOption(&builder, custom_option_buffer)); - builder.create(func.getLoc(), op.getResults()); + auto op = CustomOp::create( + builder, func.getLoc(), func.getFunctionType().getResults(), + func.getArguments(), api, CustomOption(&builder, custom_option_buffer)); + func::ReturnOp::create(builder, func.getLoc(), op.getResults()); return success(); } @@ -350,10 +350,10 @@ LogicalResult ConvertSgnnProjection(func::FuncOp func, llvm::StringRef api, custom_option_buffer))) { return failure(); } - auto op = builder.create( - func.getLoc(), func.getFunctionType().getResults(), func.getArguments(), - api, CustomOption(&builder, custom_option_buffer)); - builder.create(func.getLoc(), op.getResults()); + auto op = CustomOp::create( + builder, func.getLoc(), func.getFunctionType().getResults(), + func.getArguments(), api, CustomOption(&builder, custom_option_buffer)); + func::ReturnOp::create(builder, func.getLoc(), op.getResults()); return success(); } } // namespace diff --git a/tensorflow/compiler/mlir/lite/utils/utils.td b/tensorflow/compiler/mlir/lite/utils/utils.td index 7583d48618f4fc..d38cf411ea9f2c 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.td +++ b/tensorflow/compiler/mlir/lite/utils/utils.td @@ -136,7 +136,7 @@ def HasSameStaticShapes : Constraint< "have the same static shape">; def CreateNoneValue : NativeCodeCall< - "$_builder.create($0.getLoc(), $_builder.getUnitAttr())">; + "TFL::NoValueOp::create($_builder, $0.getLoc(), $_builder.getUnitAttr())">; // Returns shape of a ranked tensor. // if called without a ranked tensor it will fail. 
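Nearly every hunk in this patch applies the same mechanical migration of MLIR op construction: the OpBuilder member template builder.create<OpT>(loc, args...) is replaced by the static OpT::create(builder, loc, args...) form, and the TableGen NativeCodeCall strings in utils.td and legalize_tf_patterns.td receive the equivalent rewrite with $_builder. A minimal sketch of the two call styles, modeled on the CreateF32SplatConst helper touched in lstm_utils.cc above and assuming the static create() overloads available in current MLIR:

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"

// Builds a scalar f32 splat constant at the builder's current insertion
// point; mirrors CreateF32SplatConst from lstm_utils.cc.
mlir::Value MakeF32SplatConst(mlir::OpBuilder& builder, mlir::Location loc,
                              float val) {
  auto type = mlir::RankedTensorType::get({}, builder.getF32Type());
  auto attr = mlir::DenseElementsAttr::get(type, val);
  // Old form (removed by this patch): member template on OpBuilder.
  //   return builder.create<mlir::arith::ConstantOp>(loc, type, attr);
  // New form (introduced by this patch): static create() on the op class,
  // with the builder passed explicitly as the first argument.
  return mlir::arith::ConstantOp::create(builder, loc, type, attr);
}

The same one-line change repeats across the lite, quantization, stablehlo, and tensorflow passes; only the op class and argument list differ per call site.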
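A second recurring change, visible in the calibration_statistics_saver_op_test.cc and save_variables_test.cc hunks further down, replaces the TF_CHECK_OK macro (and the xla/tsl/platform:status dependency that provided it) with Abseil's CHECK_OK from absl/log/check.h, adding @com_google_absl//absl/log:check to the test deps. A standalone sketch of the replacement, using a hypothetical status-returning helper in place of the real calls:

#include "absl/log/check.h"
#include "absl/status/status.h"

// Hypothetical stand-in for the status-returning calls in the tests
// (NodeDefBuilder::Finalize, InitOp, RunOpKernel, ReadBinaryProto, ...).
absl::Status DoStep() { return absl::OkStatus(); }

int main() {
  // Previously: TF_CHECK_OK(DoStep());
  // CHECK_OK likewise aborts with the status message when the status is not
  // OK, so call sites change only in the macro name and the header behind it.
  CHECK_OK(DoStep());
  return 0;
}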
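The convert_tf_quant_to_mhlo_int_test.cc hunk below also drops the blocking PjRtBuffer::ToLiteralSync() convenience call in favor of the future-based ToLiteral() awaited explicitly. A sketch of such a call site, with the signature written out under the assumption that ToLiteral() returns a PjRtFuture whose Await() yields the same StatusOr the old call produced:

#include <memory>

#include "absl/status/statusor.h"
#include "xla/literal.h"
#include "xla/pjrt/pjrt_client.h"

// Copies a device buffer back to the host, as the test does after execution.
absl::StatusOr<std::shared_ptr<xla::Literal>> ReadBack(xla::PjRtBuffer& buffer) {
  // Previously: return buffer.ToLiteralSync();
  // ToLiteral() enqueues the transfer and returns a future; Await() blocks
  // until it completes and surfaces any failure as a non-OK status.
  return buffer.ToLiteral().Await();
}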
diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h index e94f9359d6fad2..89896d69079c28 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h @@ -89,12 +89,12 @@ Value CreateConstValue(OpBuilder& builder, const Location loc, RankedTensorType::get(shape, builder.getIntegerType(sizeof(T) * 8)); const auto attr = DenseIntElementsAttr::get(shape_type, values); - return builder.create(loc, attr); + return TF::ConstOp::create(builder, loc, attr); } const auto type = RankedTensorType::get(shape, builder.getF32Type()); const auto value_attr = DenseFPElementsAttr::get(type, values); - return builder.create(loc, value_attr); + return TF::ConstOp::create(builder, loc, value_attr); } // Creates a 1D array with integer/float type. diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h index 324b70c8fbe573..5f43083540831f 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h @@ -288,10 +288,10 @@ struct ConvertStatsToQDQs rewriter.setInsertionPointAfter(op.getOperation()); Type result_type = quant_type.castFromExpressedType(op.getType()); auto q = - rewriter.create(op.getLoc(), result_type, op.getArg()); + QuantizeOpT::create(rewriter, op.getLoc(), result_type, op.getArg()); q->setAttr(kVolatileOpAttrName, rewriter.getUnitAttr()); - auto dq = rewriter.create(op.getLoc(), op.getType(), q); + auto dq = DequantizeOpT::create(rewriter, op.getLoc(), op.getType(), q); op.getResult().replaceAllUsesWith(dq); q.getOperation()->replaceUsesOfWith(dq, op.getArg()); op.erase(); @@ -644,8 +644,8 @@ class QuantizationPattern : public RewritePattern { if (!matchPattern(q.getOperand(), m_Constant(&attr))) { continue; } - auto cst = rewriter.create( - quantized_op->getLoc(), attr); + auto cst = arith::ConstantOp::create(rewriter, + quantized_op->getLoc(), attr); quantizing_op->setOperand(i, cst.getResult()); } } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc index 0818c8013e534e..4203d7824844f9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc @@ -125,9 +125,9 @@ class ConvertTfQuantToMhloIntTest : public Test { // can't lower tf.Const. 
Value cst; if (use_mhlo_const) { - cst = builder.create(func_op->getLoc(), attrs); + cst = mhlo::ConstantOp::create(builder, func_op->getLoc(), attrs); } else { - cst = builder.create(func_op->getLoc(), attrs); + cst = TF::ConstOp::create(builder, func_op->getLoc(), attrs); } func_op.getArgument(i).replaceAllUsesWith(cst); } @@ -180,7 +180,7 @@ class ConvertTfQuantToMhloIntTest : public Test { /*byte_strides=*/std::nullopt, host_buffer_semantics, /*on_done_with_host_buffer=*/nullptr, *device_->default_memory_space(), /*device_layout=*/nullptr)); - return buffer->ToLiteralSync(); + return buffer->ToLiteral().Await(); } absl::StatusOr> CompileProgram( @@ -220,7 +220,7 @@ class ConvertTfQuantToMhloIntTest : public Test { TF_ASSIGN_OR_RETURN(auto result, executable->Execute({buffer_ptrs}, /*options=*/{})); CHECK(result.size() == 1 && result[0].size() == 1); - return result[0][0]->ToLiteralSync(); + return result[0][0]->ToLiteral().Await(); } void ExecuteAndCompareResultsWithTfKernel( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc index 46d408b06d05e4..cc63c246434934 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc @@ -241,8 +241,8 @@ class TFUniformQuantizedOpsPattern : public ConversionPattern { Type orig_op_type = op->getOperandTypes()[i]; if (IsIllegalType(orig_op_type) && !IsQintValueDefinedByIntToQintCast(op->getOperand(i))) { - new_operands.push_back(rewriter.create( - op->getLoc(), orig_op_type, operands[i])); + new_operands.push_back(TF::CastOp::create(rewriter, op->getLoc(), + orig_op_type, operands[i])); } else { new_operands.push_back(operands[i]); } @@ -261,8 +261,8 @@ class TFUniformQuantizedOpsPattern : public ConversionPattern { Value &result = new_results[i]; if (IsIllegalType(result.getType()) && !IsQintValueQintToIntCast(op->getResult(i))) { - result = rewriter.create( - op->getLoc(), ToLegalType(result.getType()), result); + result = TF::CastOp::create(rewriter, op->getLoc(), + ToLegalType(result.getType()), result); } // If the result is already consumed by qint->int CastOp, manually replace // its use by the new UQ op. This is because such CastOp is already legal, diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc index b7903b433757b5..1dd93a9b2c165e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc @@ -177,15 +177,17 @@ class BitcastConvertOpPattern return failure(); } else if (is_input_legal) { // output is f32, we bitcast_convert to f32 and then convert to bf16. - const Value output = rewriter.create( - op->getLoc(), op.getResult().getType(), adaptor.getOperand()); + const Value output = mlir::stablehlo::BitcastConvertOp::create( + rewriter, op->getLoc(), op.getResult().getType(), + adaptor.getOperand()); rewriter.replaceOpWithNewOp( op, getTypeConverter()->convertType(op.getResult().getType()), output); } else if (is_output_legal) { // input is f32, we convert from bf16 and then bitcast_convert. 
- const Value output = rewriter.create( - op->getLoc(), op.getOperand().getType(), adaptor.getOperand()); + const Value output = mlir::stablehlo::ConvertOp::create( + rewriter, op->getLoc(), op.getOperand().getType(), + adaptor.getOperand()); rewriter.replaceOpWithNewOp( op, op.getResult().getType(), output); } else { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc index 7ee6bbd98f61e6..a63ffb1504bd85 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc @@ -123,8 +123,8 @@ void ConvertXlaCallModuleOpToBfloat16Pass::runOnOperation() { builder.setInsertionPoint(op); for (auto& op_operand : op->getOpOperands()) { if (quant::stablehlo::IsLargeFloatType(op_operand.get().getType())) { - op_operand.set(builder.create( - op->getLoc(), + op_operand.set(TF::CastOp::create( + builder, op->getLoc(), quant::stablehlo::ToBfloat16Type(op_operand.get().getType()), op_operand.get())); } @@ -135,7 +135,7 @@ void ConvertXlaCallModuleOpToBfloat16Pass::runOnOperation() { const Type original_type = op_result.getType(); op_result.setType(quant::stablehlo::ToBfloat16Type(original_type)); const Value cast = - builder.create(op->getLoc(), original_type, op_result); + TF::CastOp::create(builder, op->getLoc(), original_type, op_result); op_result.replaceAllUsesExcept(cast, cast.getDefiningOp()); } } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc index 0fdefd7342624c..08befa7708297c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc @@ -58,8 +58,8 @@ LogicalResult IsTransposeOpWithPermuation(Operation* absl_nullable op, // The Location is set as `input`'s loc. TransposeOp CreateTransposeOp(Value input, const ArrayRef permutation, PatternRewriter& rewriter) { - return rewriter.create( - input.getLoc(), input, rewriter.getDenseI64ArrayAttr(permutation)); + return TransposeOp::create(rewriter, input.getLoc(), input, + rewriter.getDenseI64ArrayAttr(permutation)); } // Defers the transpose of the left-hand side (LHS) to the right-hand side and @@ -77,7 +77,7 @@ void DeferRhsTransposeForBinaryOp(OpT op, PatternRewriter& rewriter) { /*input=*/rhs, kNchwToNhwcPermutation, rewriter); auto new_binary_op = - rewriter.create(op.getLoc(), lhs_pre_transpose, rhs_transpose_op); + OpT::create(rewriter, op.getLoc(), lhs_pre_transpose, rhs_transpose_op); // NHWC -> NCHW for the output, to match the shapes of `op`'s users. TransposeOp output_transpose_op = CreateTransposeOp( @@ -166,23 +166,22 @@ class DeferActivationTransposeForMaxPoolReduceWindowOp // Create a new `stablehlo.reduce_window` with all relevant attributes // permutated to match the new operand & result type. 
- auto new_reduce_window_op = - rewriter.create( - op.getLoc(), new_result_type, transpose_op.getOperand(), - /*init_value=*/op.getOperand(1), - /*window_dimensions=*/ - PermuteI64ArrayAttr(rewriter, op.getWindowDimensions(), - kNchwToNhwcPermutation), - /*window_strides=*/ - PermuteI64ArrayAttr(rewriter, op.getWindowStrides(), - kNchwToNhwcPermutation), - /*base_dilations=*/ - PermuteI64ArrayAttr(rewriter, op.getBaseDilations(), - kNchwToNhwcPermutation), - /*window_dilations=*/ - PermuteI64ArrayAttr(rewriter, op.getWindowDilations(), - kNchwToNhwcPermutation), - /*padding=*/DenseIntElementsAttr(nullptr)); + auto new_reduce_window_op = mlir::stablehlo::ReduceWindowOp::create( + rewriter, op.getLoc(), new_result_type, transpose_op.getOperand(), + /*init_value=*/op.getOperand(1), + /*window_dimensions=*/ + PermuteI64ArrayAttr(rewriter, op.getWindowDimensions(), + kNchwToNhwcPermutation), + /*window_strides=*/ + PermuteI64ArrayAttr(rewriter, op.getWindowStrides(), + kNchwToNhwcPermutation), + /*base_dilations=*/ + PermuteI64ArrayAttr(rewriter, op.getBaseDilations(), + kNchwToNhwcPermutation), + /*window_dilations=*/ + PermuteI64ArrayAttr(rewriter, op.getWindowDilations(), + kNchwToNhwcPermutation), + /*padding=*/DenseIntElementsAttr(nullptr)); // Clone the reduce body. It is not affected by the permutation. IRMapping mapping; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc index 699f157e3d1b39..f4648f9a0a0362 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc @@ -160,8 +160,8 @@ class FoldTransposedConstantOp /*elementType=*/rewriter.getF32Type()); auto new_value_attr = DenseFPElementsAttr::get(new_value_type, std::move(transposed_values)); - auto new_const_op = rewriter.create( - combined_loc, new_value_attr); + auto new_const_op = mlir::stablehlo::ConstantOp::create( + rewriter, combined_loc, new_value_attr); rewriter.replaceAllUsesWith(op, new_const_op); return success(); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc index e855c51749e6d5..05a826b14b010a 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc @@ -86,9 +86,9 @@ LogicalResult InsertCalibrationStatisticsSaverOp( ArrayAttr ids_attr = builder.getStrArrayAttr(ids); ArrayAttr calibration_methods_attr = builder.getI32ArrayAttr(calibration_methods); - builder.create( - region.getLoc(), statistics_outputs, output_file_path_attr, ids_attr, - calibration_methods_attr); + TF::CalibrationStatisticsSaverOp::create( + builder, region.getLoc(), statistics_outputs, output_file_path_attr, + ids_attr, calibration_methods_attr); return success(); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc index c72879c2e04a4d..71a5b35e351495 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc @@ -114,20 +114,21 @@ 
class MergeFusionWithUniformDequantizePattern // Modify the quantized fused function to do dequantize+relu(6). rewriter.setInsertionPoint(req_op); - Value new_result = rewriter.create( - req_op.getLoc(), func_op.getResultTypes()[0], req_op.getOperand()); + Value new_result = mlir::stablehlo::UniformDequantizeOp::create( + rewriter, req_op.getLoc(), func_op.getResultTypes()[0], + req_op.getOperand()); if (func_name.contains("_relu6_")) { - auto min = rewriter.create( - req_op.getLoc(), rewriter.getF32FloatAttr(0)); - auto max = rewriter.create( - req_op.getLoc(), rewriter.getF32FloatAttr(6)); - new_result = rewriter.create( - req_op.getLoc(), min, new_result, max); + auto min = mlir::stablehlo::ConstantOp::create( + rewriter, req_op.getLoc(), rewriter.getF32FloatAttr(0)); + auto max = mlir::stablehlo::ConstantOp::create( + rewriter, req_op.getLoc(), rewriter.getF32FloatAttr(6)); + new_result = mlir::stablehlo::ClampOp::create(rewriter, req_op.getLoc(), + min, new_result, max); } else if (func_name.contains("_relu_")) { - auto min = rewriter.create( - req_op.getLoc(), rewriter.getF32FloatAttr(0)); - new_result = rewriter.create( - req_op.getLoc(), min, new_result, nullptr); + auto min = mlir::stablehlo::ConstantOp::create( + rewriter, req_op.getLoc(), rewriter.getF32FloatAttr(0)); + new_result = mlir::chlo::BroadcastMaxOp::create(rewriter, req_op.getLoc(), + min, new_result, nullptr); } return_op->setOperand(0, new_result); rewriter.eraseOp(req_op); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc index 51950c5513c5df..1c425487799962 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc @@ -76,8 +76,9 @@ class RewriteNchwConvolutionToNhwc const TensorType new_input_tensor_type = GetTransposedTensorType( mlir::cast(input.getType()), kNchwToNhwcPermutation); - auto input_transpose_op = rewriter.create( - op.getLoc(), /*resultType0=*/new_input_tensor_type, /*operand=*/input, + auto input_transpose_op = mlir::stablehlo::TransposeOp::create( + rewriter, op.getLoc(), /*resultType0=*/new_input_tensor_type, + /*operand=*/input, rewriter.getDenseI64ArrayAttr(kNchwToNhwcPermutation)); // Transpose the filter tensor: [o, i, 0, 1] => [0, 1, i, o] @@ -85,8 +86,9 @@ class RewriteNchwConvolutionToNhwc const TensorType new_filter_tensor_type = GetTransposedTensorType( mlir::cast(filter.getType()), kOihwToHwioPermutation); - auto filter_transpose_op = rewriter.create( - op.getLoc(), /*resultType0=*/new_filter_tensor_type, /*operand=*/filter, + auto filter_transpose_op = mlir::stablehlo::TransposeOp::create( + rewriter, op.getLoc(), /*resultType0=*/new_filter_tensor_type, + /*operand=*/filter, rewriter.getDenseI64ArrayAttr(kOihwToHwioPermutation)); // [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f] @@ -108,8 +110,8 @@ class RewriteNchwConvolutionToNhwc // reused without modification because the ordering of spatial dimensions // is not modified (i.e. before: [b, f, 0, 1], after: [b, 0, 1, f] => the // spatial dimension is still ordered as {0, 1}). 
- auto new_convolution_op = rewriter.create( - op.getLoc(), /*resultType0=*/new_conv_output_tensor_type, + auto new_convolution_op = mlir::stablehlo::ConvolutionOp::create( + rewriter, op.getLoc(), /*resultType0=*/new_conv_output_tensor_type, /*lhs=*/input_transpose_op, /*rhs=*/filter_transpose_op, /*window_strides=*/op.getWindowStridesAttr(), @@ -125,8 +127,9 @@ class RewriteNchwConvolutionToNhwc // Transpose the output of the `ConvolutionOp` back to the original op's // output shape so that users' shapes match. // [b, 0, 1, f] => [b, f, 0, 1] - auto output_transpose_op = rewriter.create( - new_convolution_op.getLoc(), /*resultType0=*/output_tensor_type, + auto output_transpose_op = mlir::stablehlo::TransposeOp::create( + rewriter, new_convolution_op.getLoc(), + /*resultType0=*/output_tensor_type, /*operand=*/new_convolution_op, rewriter.getDenseI64ArrayAttr(kNhwcToNchwPermutation)); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc index 2102e64f223d55..4dff113b6427c9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc @@ -95,8 +95,8 @@ class MergeConsecutiveQuantizeCast q_op.getArg().getDefiningOp(); if (!preceding_qcast) return failure(); - auto new_qcast = rewriter.create( - q_op.getLoc(), q_op.getType(), preceding_qcast.getArg()); + auto new_qcast = mlir::quant::ir::QuantizeCastOp::create( + rewriter, q_op.getLoc(), q_op.getType(), preceding_qcast.getArg()); new_qcast->setAttr(kVolatileOpAttrName, rewriter.getUnitAttr()); q_op->replaceAllUsesWith(new_qcast); return success(); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc index 9d0a978bdb8efc..e65d5423458f50 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc @@ -163,8 +163,8 @@ class QuantizeWeight : public OpRewritePattern { } } rewriter.setInsertionPointAfter(op); - ConvertOp new_convert_op = rewriter.create( - op->getLoc(), new_result_type, op.getResult()); + ConvertOp new_convert_op = ConvertOp::create( + rewriter, op->getLoc(), new_result_type, op.getResult()); quantizable_op->setOperand(quantize_operand_num, new_convert_op.getResult()); } @@ -203,10 +203,10 @@ class QuantizeWeight : public OpRewritePattern { // of its number of users. rewriter.setInsertionPointAfter(op); // create new F16 constant op in that location - ConstantOp new_const = rewriter.create( - op->getLoc(), new_result_type, new_value_attr); + ConstantOp new_const = ConstantOp::create( + rewriter, op->getLoc(), new_result_type, new_value_attr); ConvertOp dcast = - rewriter.create(op->getLoc(), old_result_type, new_const); + ConvertOp::create(rewriter, op->getLoc(), old_result_type, new_const); // replace all convert ops with dq op. 
convert_op->replaceAllUsesWith(dcast); // Return without scanning for the next ConvertOp as only one ConvertOp is diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc index ac1f5e8d705d49..46da2a3f25b82c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc @@ -82,8 +82,8 @@ void UnwrapXlaCallModuleOp(TF::XlaCallModuleOp call_op, // TODO: b/310291615 - find a better way for multi-platform support. if (call_op_has_platform_index_arg) { arg_mapper.map(func_op.getArgument(0), - builder.create( - func_op.getLoc(), builder.getI16IntegerAttr(0))); + mhlo::ConstantOp::create(builder, func_op.getLoc(), + builder.getI16IntegerAttr(0))); } for (auto [func_arg, operand] : llvm::zip_equal( func_op.getArguments().take_back(call_op.getNumOperands()), diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD index 4ec998d18bcfa3..b06568589dadf2 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD @@ -221,9 +221,9 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status:status_matchers", "@com_google_googletest//:gtest", "@local_xla//xla/tsl/platform:errors", - "@local_xla//xla/tsl/platform:status", ], ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc index fd3acb188656a1..7f8f2623b7bfa0 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc @@ -19,11 +19,11 @@ limitations under the License. 
#include #include +#include "absl/log/check.h" #include "absl/status/status_matchers.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "xla/tsl/platform/errors.h" -#include "xla/tsl/platform/status.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" @@ -55,11 +55,11 @@ TEST_F(CalibrationStatisticsSaverTest, MissingOutputPath) { inputs.emplace_back("min", 0, DT_FLOAT); inputs.emplace_back("max", 0, DT_FLOAT); - TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") - .Input(inputs) - .Attr("ids", ids) - .Attr("calibration_methods", calibration_methods) - .Finalize(node_def())); + CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Finalize(node_def())); ASSERT_THAT(InitOp(), absl_testing::StatusIs( tsl::error::INVALID_ARGUMENT, @@ -75,12 +75,12 @@ TEST_F(CalibrationStatisticsSaverTest, WrongNumInputs) { inputs.emplace_back("min", 0, DT_FLOAT); inputs.emplace_back("max", 0, DT_FLOAT); - TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") - .Input(inputs) - .Attr("ids", ids) - .Attr("calibration_methods", calibration_methods) - .Attr("output_file_path", "/tmp/statistics.pbtxt") - .Finalize(node_def())); + CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", "/tmp/statistics.pbtxt") + .Finalize(node_def())); ASSERT_THAT(InitOp(), absl_testing::StatusIs( tsl::error::ABORTED, @@ -98,12 +98,12 @@ TEST_F(CalibrationStatisticsSaverTest, WrongInputTypes) { inputs.emplace_back("max", 0, DT_FLOAT); inputs.emplace_back("histogram", 0, DT_FLOAT); - TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") - .Input(inputs) - .Attr("ids", ids) - .Attr("calibration_methods", calibration_methods) - .Attr("output_file_path", "/tmp/statistics.pbtxt") - .Finalize(node_def())); + CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", "/tmp/statistics.pbtxt") + .Finalize(node_def())); ASSERT_THAT(InitOp(), absl_testing::StatusIs( tsl::error::ABORTED, @@ -123,24 +123,23 @@ TEST_F(CalibrationStatisticsSaverTest, SimpleMinMax) { const std::string dir = testing::TmpDir(); const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt"); - TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") - .Input(inputs) - .Attr("ids", ids) - .Attr("calibration_methods", calibration_methods) - .Attr("output_file_path", output_file_path) - .Finalize(node_def())); - TF_CHECK_OK(InitOp()); + CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", output_file_path) + .Finalize(node_def())); + CHECK_OK(InitOp()); AddInputFromArray(TensorShape({}), {1.f}); AddInputFromArray(TensorShape({}), {5.f}); AddInputFromArray(TensorShape({0}), {}); - TF_CHECK_OK(RunOpKernel()); + CHECK_OK(RunOpKernel()); kernel_.reset(); CalibrationStatisticsMap statistics_map; - TF_CHECK_OK( - ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); + CHECK_OK(ReadBinaryProto(Env::Default(), 
output_file_path, &statistics_map)); ASSERT_THAT(statistics_map.statistics(), SizeIs(1)); ASSERT_THAT(statistics_map.statistics(), ElementsAre(Key("1"))); @@ -163,24 +162,23 @@ TEST_F(CalibrationStatisticsSaverTest, SimpleAverageMinMax) { const std::string dir = testing::TmpDir(); const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt"); - TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") - .Input(inputs) - .Attr("ids", ids) - .Attr("calibration_methods", calibration_methods) - .Attr("output_file_path", output_file_path) - .Finalize(node_def())); - TF_CHECK_OK(InitOp()); + CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", output_file_path) + .Finalize(node_def())); + CHECK_OK(InitOp()); AddInputFromArray(TensorShape({}), {1.f}); AddInputFromArray(TensorShape({}), {5.f}); AddInputFromArray(TensorShape({0}), {}); - TF_CHECK_OK(RunOpKernel()); + CHECK_OK(RunOpKernel()); kernel_.reset(); CalibrationStatisticsMap statistics_map; - TF_CHECK_OK( - ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); + CHECK_OK(ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); ASSERT_THAT(statistics_map.statistics(), SizeIs(1)); ASSERT_THAT(statistics_map.statistics(), ElementsAre(Key("1"))); @@ -204,24 +202,23 @@ TEST_F(CalibrationStatisticsSaverTest, SimpleHistogram) { const std::string dir = testing::TmpDir(); const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt"); - TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") - .Input(inputs) - .Attr("ids", ids) - .Attr("calibration_methods", calibration_methods) - .Attr("output_file_path", output_file_path) - .Finalize(node_def())); - TF_CHECK_OK(InitOp()); + CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", output_file_path) + .Finalize(node_def())); + CHECK_OK(InitOp()); AddInputFromArray(TensorShape({}), {1.f}); AddInputFromArray(TensorShape({}), {5.f}); AddInputFromArray(TensorShape({8}), {1, 4, 6, 7, 3, 2, 1, 0}); - TF_CHECK_OK(RunOpKernel()); + CHECK_OK(RunOpKernel()); kernel_.reset(); CalibrationStatisticsMap statistics_map; - TF_CHECK_OK( - ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); + CHECK_OK(ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); ASSERT_THAT(statistics_map.statistics(), SizeIs(1)); ASSERT_THAT(statistics_map.statistics(), ElementsAre(Key("1"))); @@ -250,13 +247,13 @@ TEST_F(CalibrationStatisticsSaverTest, MultipleStats) { const std::string dir = testing::TmpDir(); const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt"); - TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") - .Input(inputs) - .Attr("ids", ids) - .Attr("calibration_methods", calibration_methods) - .Attr("output_file_path", output_file_path) - .Finalize(node_def())); - TF_CHECK_OK(InitOp()); + CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", output_file_path) + .Finalize(node_def())); + CHECK_OK(InitOp()); AddInputFromArray(TensorShape({}), {1.f}); AddInputFromArray(TensorShape({}), {5.f}); @@ -265,12 +262,11 @@ TEST_F(CalibrationStatisticsSaverTest, MultipleStats) { AddInputFromArray(TensorShape({}), {5.f}); 
AddInputFromArray(TensorShape({8}), {1, 4, 6, 7, 3, 2, 1, 0}); - TF_CHECK_OK(RunOpKernel()); + CHECK_OK(RunOpKernel()); kernel_.reset(); CalibrationStatisticsMap statistics_map; - TF_CHECK_OK( - ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); + CHECK_OK(ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); ASSERT_THAT(statistics_map.statistics(), SizeIs(2)); ASSERT_THAT(statistics_map.statistics(), Contains(Key("1"))); ASSERT_THAT(statistics_map.statistics(), Contains(Key("2"))); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD index 6fe5bd285f8f50..99f2d2a52e1a92 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD @@ -54,6 +54,7 @@ tf_cc_test( "//tensorflow/core/framework:tensor_testutil", "//tensorflow/core/util/tensor_bundle", "@com_google_absl//absl/cleanup", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/status:status_matchers", "@com_google_absl//absl/status:statusor", @@ -62,7 +63,6 @@ tf_cc_test( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", - "@local_xla//xla/tsl/platform:status", ], ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc index 5cef40a8e77c0e..42bf32a27e7bee 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc @@ -67,7 +67,7 @@ LogicalResult FoldOperation(OpBuilder& builder, Operation* op, results.clear(); builder.setInsertionPointAfter(op); for (const auto& result_value : result_values) { - results.push_back(builder.create(op->getLoc(), result_value)); + results.push_back(TF::ConstOp::create(builder, op->getLoc(), result_value)); } return success(); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc index 2fca9426c9d59f..3c5d085d7655aa 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "absl/cleanup/cleanup.h" +#include "absl/log/check.h" #include "absl/status/status.h" #include "absl/status/status_matchers.h" #include "absl/status/statusor.h" @@ -31,7 +32,6 @@ limitations under the License. 
#include "mlir/Parser/Parser.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" -#include "xla/tsl/platform/status.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.pb.h" @@ -63,7 +63,7 @@ class SaveVariablesToCheckpointTest : public ::testing::Test { return absl::InternalError("Failed to create temp file."); } - TF_CHECK_OK(env_->CreateDir(tmp_dir)); + CHECK_OK(env_->CreateDir(tmp_dir)); return tmp_dir; } @@ -103,8 +103,8 @@ TEST_F(SaveVariablesToCheckpointTest, VariableSavedToCheckpoint) { const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() { int64_t undeleted_files, undeleted_dirs; - TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, - &undeleted_dirs)); + CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, + &undeleted_dirs)); }; const absl::StatusOr> variable_shared_names = @@ -149,8 +149,8 @@ TEST_F(SaveVariablesToCheckpointTest, MultipleVariablesSavedToCheckpoint) { const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() { int64_t undeleted_files, undeleted_dirs; - TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, - &undeleted_dirs)); + CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, + &undeleted_dirs)); }; const absl::StatusOr> variable_shared_names = @@ -186,8 +186,8 @@ TEST_F(SaveVariablesToCheckpointTest, const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() { int64_t undeleted_files, undeleted_dirs; - TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, - &undeleted_dirs)); + CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, + &undeleted_dirs)); }; const absl::StatusOr> variable_shared_names = @@ -218,8 +218,8 @@ TEST_F(SaveVariablesToCheckpointTest, const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() { int64_t undeleted_files, undeleted_dirs; - TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, - &undeleted_dirs)); + CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, + &undeleted_dirs)); }; EXPECT_TRUE( @@ -253,8 +253,8 @@ TEST_F(SaveVariablesToCheckpointTest, const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() { int64_t undeleted_files, undeleted_dirs; - TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, - &undeleted_dirs)); + CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, + &undeleted_dirs)); }; const absl::StatusOr> variable_shared_names = @@ -293,8 +293,8 @@ TEST_F(SaveVariablesToCheckpointTest, MutableVariablesNotSaved) { const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() { int64_t undeleted_files, undeleted_dirs; - TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, - &undeleted_dirs)); + CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, + &undeleted_dirs)); }; const absl::StatusOr> variable_shared_names = @@ -330,8 +330,8 @@ TEST_F(SaveVariablesToCheckpointTest, const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() { int64_t undeleted_files, undeleted_dirs; - TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, - &undeleted_dirs)); + CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, + &undeleted_dirs)); }; 
const absl::StatusOr> variable_shared_names = @@ -371,8 +371,8 @@ TEST_F(SaveVariablesToCheckpointTest, FailsWhenDuplicateSharedName) { const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() { int64_t undeleted_files, undeleted_dirs; - TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, - &undeleted_dirs)); + CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files, + &undeleted_dirs)); }; EXPECT_FALSE( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc index 430056668af8ae..c2339fe044edd7 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc @@ -91,8 +91,8 @@ class CastBf16OpsToF32 : public RewritePattern { for (int i = 0; i < op->getNumOperands(); i++) { Value input = op->getOperand(i); if (getElementTypeOrSelf(input).isBF16()) { - Value f32_cast = rewriter.create( - op->getLoc(), + Value f32_cast = TF::CastOp::create( + rewriter, op->getLoc(), CloneTypeWithNewElementType(input.getType(), rewriter.getF32Type()), input); op->setOperand(i, f32_cast); @@ -108,8 +108,8 @@ class CastBf16OpsToF32 : public RewritePattern { for (Operation* user : op->getUsers()) { for (int i = 0; i < user->getNumOperands(); i++) { if (user->getOperand(i) == value) { - Value bf16_cast = rewriter.create( - user->getLoc(), + Value bf16_cast = TF::CastOp::create( + rewriter, user->getLoc(), CloneTypeWithNewElementType(value.getType(), rewriter.getBF16Type()), value); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc index bfbb8b45c2d80c..2ae814880fc2ff 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc @@ -241,10 +241,10 @@ class PrepareDRQQuantizableOp : public OpRewritePattern { } } rewriter.setInsertionPointAfter(op); - auto q = rewriter.create( - op->getLoc(), cast_type, op.getResult()); - auto dq = rewriter.create( - op->getLoc(), expressed_type, q); + auto q = mlir::quant::ir::QuantizeCastOp::create(rewriter, op->getLoc(), + cast_type, op.getResult()); + auto dq = mlir::quant::ir::DequantizeCastOp::create(rewriter, op->getLoc(), + expressed_type, q); quantize_op->setOperand(quantize_operand_num, dq.getResult()); return true; } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc index 2ef9eeecc7bc2d..0c42b760557c51 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc @@ -32,8 +32,8 @@ namespace { Value GetDimValue(OpBuilder &builder, Location loc, Value shape_value, int32_t dim) { Type attribute_type = builder.getI64Type(); - return builder.create( - loc, + return TF::StridedSliceOp::create( + builder, loc, RankedTensorType::get( {}, mlir::cast(shape_value.getType()).getElementType()), /*input=*/shape_value, @@ -60,16 +60,16 @@ void GetSamePaddingValues(OpBuilder &builder, Location loc, Value input_size, Type int32_scalar_type = zero.getType(); auto scalar_add = [&](Value lhs, Value rhs) { - return 
builder.create(loc, int32_scalar_type, lhs, rhs); + return TF::AddOp::create(builder, loc, int32_scalar_type, lhs, rhs); }; auto scalar_mul = [&](Value lhs, Value rhs) { - return builder.create(loc, int32_scalar_type, lhs, rhs); + return TF::MulOp::create(builder, loc, int32_scalar_type, lhs, rhs); }; auto scalar_sub = [&](Value lhs, Value rhs) { - return builder.create(loc, int32_scalar_type, lhs, rhs); + return TF::SubOp::create(builder, loc, int32_scalar_type, lhs, rhs); }; auto scalar_div = [&](Value lhs, Value rhs) { - return builder.create(loc, int32_scalar_type, lhs, rhs); + return TF::DivOp::create(builder, loc, int32_scalar_type, lhs, rhs); }; // effective_filter_size = (filter_size - 1) * dilation_rate + 1 @@ -90,7 +90,7 @@ void GetSamePaddingValues(OpBuilder &builder, Location loc, Value input_size, scalar_add(effective_filter_size_op, scalar_mul(stride_value, scalar_sub(output_size, one))), input_size); - padding_needed = builder.create(loc, padding_needed, zero); + padding_needed = TF::MaximumOp::create(builder, loc, padding_needed, zero); padding_low = scalar_div(padding_needed, two); padding_high = scalar_sub(padding_needed, padding_low); } @@ -104,14 +104,15 @@ Value PadForDynamicShapedInputSamePadding( auto reshape_op = [&](Value value, const SmallVector &shape) { const int64_t rank = shape.size(); - return builder.create( - loc, RankedTensorType::get(shape, builder.getI32Type()), value, + return TF::ReshapeOp::create( + builder, loc, RankedTensorType::get(shape, builder.getI32Type()), value, CreateConstValue(builder, loc, {rank}, shape)); }; ShapedType filter_shape = mlir::cast(filter.getType()); - Value input_shape_value = builder.create( - loc, RankedTensorType::get({num_dims}, builder.getI32Type()), input); + Value input_shape_value = TF::ShapeOp::create( + builder, loc, RankedTensorType::get({num_dims}, builder.getI32Type()), + input); auto scalar_to_rank1 = [&](Value value) { return reshape_op(value, {1}); }; for (int i : llvm::seq(1, num_dims - 1)) { Value input_size_i = GetDimValue(builder, loc, input_shape_value, i); @@ -131,12 +132,12 @@ Value PadForDynamicShapedInputSamePadding( builder, loc, /*shape=*/{num_dims - 2, 2}, /*values=*/SmallVector(2 * (num_dims - 2), 0)); Value zero = CreateScalarConstValue(builder, loc, 0); - Value temp_padding_rank1 = builder.create( - loc, RankedTensorType::get({2 * num_dims}, builder.getI32Type()), zero, - temp_padding_values); + Value temp_padding_rank1 = TF::ConcatOp::create( + builder, loc, RankedTensorType::get({2 * num_dims}, builder.getI32Type()), + zero, temp_padding_values); Value temp_padding = reshape_op(temp_padding_rank1, {num_dims, 2}); - return builder.create( - loc, input.getType(), input, temp_padding, + return TF::PadV2Op::create( + builder, loc, input.getType(), input, temp_padding, CreateScalarConstValue(builder, loc, input_zp_value)); } @@ -224,9 +225,9 @@ Value CalculatePaddingAndPadIfNeeded(OpBuilder &builder, Location loc, output_shape[i] += padding_values[2 * i] + padding_values[2 * i + 1]; } - return builder.create( - loc, RankedTensorType::get(output_shape, builder.getI8Type()), input, - temp_padding, + return TF::PadV2Op::create( + builder, loc, RankedTensorType::get(output_shape, builder.getI8Type()), + input, temp_padding, CreateScalarConstValue(builder, loc, input_zp_value)); } @@ -254,7 +255,7 @@ Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) { value_type.getShape().end()); RankedTensorType shape_type = RankedTensorType::get({rank}, builder.getI64Type()); - Value 
shape_value = builder.create(loc, shape_type, value); + Value shape_value = TF::ShapeOp::create(builder, loc, shape_type, value); // It is guaranteed that packed_shape[pack_dim] is known. if (packed_shape[pack_dim] % 2 != 0) { @@ -263,14 +264,14 @@ Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) { padding[pack_dim * 2 + 1] = 1; Value padding_value = CreateConstValue(builder, loc, {rank, 2}, padding); - value = builder.create( - loc, RankedTensorType::get(packed_shape, builder.getI8Type()), value, - padding_value, CreateScalarConstValue(builder, loc, 0)); + value = TF::PadV2Op::create( + builder, loc, RankedTensorType::get(packed_shape, builder.getI8Type()), + value, padding_value, CreateScalarConstValue(builder, loc, 0)); SmallVector shape_add(rank, 0); shape_add[pack_dim] = 1; - shape_value = builder.create( - loc, shape_type, shape_value, + shape_value = TF::AddOp::create( + builder, loc, shape_type, shape_value, CreateConstValue(builder, loc, {rank}, shape_add)); } packed_shape[pack_dim] /= 2; @@ -279,17 +280,17 @@ Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) { RankedTensorType packed_output_type = RankedTensorType::get(packed_shape, builder.getI8Type()); - Value packed_shape_value = builder.create( - loc, shape_type, shape_value, + Value packed_shape_value = TF::DivOp::create( + builder, loc, shape_type, shape_value, CreateConstValue(builder, loc, {rank}, divisor)); Value packed_low_begin_value = CreateConstValue( builder, loc, {rank}, SmallVector(rank, 0)); Value packed_low_value = - builder.create(loc, packed_output_type, value, - packed_low_begin_value, packed_shape_value); - packed_low_value = builder.create( - loc, packed_output_type, packed_low_value, + TF::SliceOp::create(builder, loc, packed_output_type, value, + packed_low_begin_value, packed_shape_value); + packed_low_value = TF::BitwiseAndOp::create( + builder, loc, packed_output_type, packed_low_value, CreateScalarConstValue(builder, loc, 0x0F)); SmallVector packed_high_begin(rank, 0); @@ -297,14 +298,14 @@ Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) { Value packed_high_begin_value = CreateConstValue(builder, loc, {rank}, packed_high_begin); Value packed_high_value = - builder.create(loc, packed_output_type, value, - packed_high_begin_value, packed_shape_value); - packed_high_value = builder.create( - loc, packed_output_type, packed_high_value, + TF::SliceOp::create(builder, loc, packed_output_type, value, + packed_high_begin_value, packed_shape_value); + packed_high_value = TF::LeftShiftOp::create( + builder, loc, packed_output_type, packed_high_value, CreateScalarConstValue(builder, loc, 4)); - Operation *packed = builder.create( - loc, packed_output_type, packed_low_value, packed_high_value); + Operation* packed = TF::BitwiseOrOp::create( + builder, loc, packed_output_type, packed_low_value, packed_high_value); return ConstantFoldOpIfPossible(packed).front(); } diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td index 24b1d05bce9735..ce91055db9c666 100644 --- a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td @@ -40,7 +40,7 @@ def CastValueToI64: NativeCodeCall< "CastValueToI64($0.getLoc(), $1, &$_builder)">; def CastValueToElementType: NativeCodeCall< - "$_builder.create($0.getLoc(), $1, " + "ConvertOp::create($_builder, 
$0.getLoc(), $1, " "getElementTypeOrSelf($2.getType()))">; // Here, $0 is an ElementsAttr with exactly one element of type integer. $1 is diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc b/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc index d440f20e6d9779..f963d7a9c8dcb1 100644 --- a/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc +++ b/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc @@ -27,14 +27,14 @@ namespace odml { mhlo::ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, OpBuilder* builder) { - return builder->create(loc, - hlo::getScalarOfType(ty, raw_value)); + return mhlo::ConstantOp::create(*builder, loc, + hlo::getScalarOfType(ty, raw_value)); } mhlo::ConstantOp GetScalarNegZeroOfType(Type ty, Location loc, OpBuilder* builder) { - return builder->create(loc, - hlo::getScalarNegZeroOfType(ty)); + return mhlo::ConstantOp::create(*builder, loc, + hlo::getScalarNegZeroOfType(ty)); } DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr) { diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index cbd6bc3b283504..494c23f1344e9b 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -298,9 +298,11 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_canonicalize_inc_gen", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", "@llvm-project//llvm:Support", "@llvm-project//mlir:BytecodeOpInterface", "@llvm-project//mlir:CallOpInterfaces", @@ -350,6 +352,8 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_canonicalize_inc_gen", "//tensorflow/core:framework", "//tensorflow/core:lib", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:BytecodeOpInterface", "@llvm-project//mlir:CallOpInterfaces", @@ -399,6 +403,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core/common_runtime:inline_function_utils", "//tensorflow/core/common_runtime:lower_function_call_inline_policy", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:BytecodeOpInterface", @@ -502,6 +507,9 @@ cc_library( "//tensorflow/core/ir:Dialect", "//tensorflow/core/ir/types:Dialect", "//tensorflow/core/platform:logging", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:ArithDialect", @@ -535,6 +543,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/platform:test", + "@com_google_googletest//:gtest_main", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", @@ -565,10 +574,11 @@ cc_library( "//tensorflow/core:framework_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core:session_options", - "//tensorflow/core/common_runtime:threadpool_device", "//tensorflow/core/platform:errors", "//tensorflow/core/platform:status", "//tensorflow/core/platform:threadpool_options", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", ], @@ -905,7 +915,6 @@ cc_library( 
"//tensorflow/core/platform:status", "//tensorflow/core/util:managed_stack_trace", "@com_google_absl//absl/status", - "@com_google_absl//absl/strings:string_view", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", "@local_xla//xla/mlir/utils:error_util", @@ -949,10 +958,8 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", - "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@local_xla//xla/hlo/testlib:test", - "@local_xla//xla/mlir/utils:error_util", ], ) @@ -1305,8 +1312,10 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/platform:test", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc index 9a78a1a83ae214..a41e81b0bda21a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h" +#include + #include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index 19a988827bdf42..f6ce8d327a8874 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "absl/log/check.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index e8d0ea525943fd..db85471f6ed6aa 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include +#include #include #include "llvm/ADT/ArrayRef.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index ecd4f7560c359a..931fb51426257b 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -11497,6 +11497,26 @@ representation of that entry. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_PostProcessPredictionOp : TF_Op<"PostProcessPrediction", []> { + let summary = [{ +Performs post-processing on prediction inputs. This op has no tensor outputs. + }]; + + let description = [{ +Send an rpc to the external service that builds rpc payload based on prediction result. 
+ }]; + + let arguments = (ins + Arg:$steps, + Arg:$gaia_id, + Arg:$video_id, + + StrAttr:$op_config + ); + + let results = (outs); +} + def TF_PowOp : TF_Op<"Pow", [Pure, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Computes the power of one value to another."; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc index 2cc385794122a2..60a3ea3abdc10c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" +#include + namespace mlir { namespace TF { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index f432b6b1f612f8..160413009efb3a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -15,16 +15,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include -#include #include -#include -#include -#include -#include -#include #include +#include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index 6382f325a47505..a0fefadca96559 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/log/check.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc index b3ce501c1c08d1..02105ad8cfc210 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h" +#include +#include +#include + #include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index 23683673fe189a..1d9a4fecfab4cf 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -30,6 +30,7 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" @@ -90,6 +91,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h" +#include "tensorflow/core/framework/types.pb.h" namespace mlir { namespace TF { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc index ca8f27a1489c06..0b13f1791c7717 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h" +#include + #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc index 7419149074fb8a..1764aa1124059c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc @@ -15,16 +15,12 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h" -#include #include -#include -#include -#include #include #include -#include -#include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 45717471e373a2..74af6e58fb2294 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -16,7 +16,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include +#include +#include "absl/algorithm/container.h" +#include "absl/log/log.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_test.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_test.cc index 48cfb26d6802b9..1c53c8296a4a17 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_test.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc index d6d22098666ffe..adf055365a9c56 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" +#include + #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.cc b/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.cc index 5921efa20969b2..93c33e9799a298 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.cc @@ -15,8 +15,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h" -#include - namespace mlir { namespace TF { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD index e052268b6ede98..162a597ef7c40e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD @@ -294,7 +294,6 @@ glob_lit_tests( default_tags = [ "no_mac", # TODO(b/191167848) "no_oss", # TODO(b/190855110) - "cuda-only", ], driver = "@llvm-project//mlir:run_lit.sh", exclude = [ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc index 93d31b884732c1..2beec1bcd87944 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc @@ -107,8 +107,8 @@ void BuildLaunchForCluster(const TF::Cluster& c, OpBuilder* builder) { // as operand. OpBuilder return_builder(builder->getContext()); return_builder.setInsertionPointToEnd(block); - return_builder.create(return_builder.getUnknownLoc(), - live_outs); + tf_device::ReturnOp::create(return_builder, return_builder.getUnknownLoc(), + live_outs); llvm::SmallVector live_out_types; live_out_types.reserve(live_outs.size()); @@ -116,8 +116,8 @@ void BuildLaunchForCluster(const TF::Cluster& c, OpBuilder* builder) { live_out_types.emplace_back(v.getType()); } - tf_device::LaunchOp launch_op = builder->create( - builder->getUnknownLoc(), builder->getStringAttr(c.target), + tf_device::LaunchOp launch_op = tf_device::LaunchOp::create( + *builder, builder->getUnknownLoc(), builder->getStringAttr(c.target), live_out_types); // Attach the region to launch_op. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc index beee1afb1a129e..9158ecc6f7fcd7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc @@ -575,7 +575,7 @@ tf_device::ClusterOp CreateClusterOp(Cluster &cluster, StringAttr policy) { OpBuilder builder(back); auto cluster_op = - builder.create(loc, return_types, policy); + tf_device::ClusterOp::create(builder, loc, return_types, policy); // Create block in cluster_op's region and move 'cluster.operations' into // it. @@ -585,7 +585,7 @@ tf_device::ClusterOp CreateClusterOp(Cluster &cluster, StringAttr policy) { // Add 'tf_device::ReturnOp' at the end of the block. 
builder.setInsertionPointToEnd(block); - builder.create(loc, return_values.getArrayRef()); + tf_device::ReturnOp::create(builder, loc, return_values.getArrayRef()); // Set device attribute if (auto device = back->getAttr(kDeviceAttr)) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc index d63ace094451a6..ea7dce395d84d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc @@ -370,7 +370,7 @@ void AppendFunctionResults(func::FuncOp func, int num_resources, // function. OpBuilder builder(graph_op); auto new_graph_op = - builder.create(graph_op.getLoc(), new_result_types); + GraphOp::create(builder, graph_op.getLoc(), new_result_types); new_graph_op.getRegion().takeBody(graph_op.getRegion()); graph_op->replaceAllUsesWith( new_graph_op->getResults().drop_back(num_resources)); @@ -388,14 +388,15 @@ IslandOp CreateIsland(Operation* sub_op, ValueRange control_inputs, OpBuilder builder) { assert(sub_op); auto control_type = ControlType::get(builder.getContext()); - auto island = builder.create( - sub_op->getLoc(), sub_op->getResultTypes(), control_type, control_inputs); + auto island = + IslandOp::create(builder, sub_op->getLoc(), sub_op->getResultTypes(), + control_type, control_inputs); island.getBody().push_back(new Block); Block* block = &island.getBody().back(); builder.setInsertionPointToEnd(block); sub_op->replaceAllUsesWith(island.getOutputs()); sub_op->moveBefore(block, block->begin()); - builder.create(sub_op->getLoc(), sub_op->getResults()); + YieldOp::create(builder, sub_op->getLoc(), sub_op->getResults()); return island; } @@ -429,12 +430,12 @@ void ChainResourceOps( // Create chain source and sink identity islands for current equivalence // class. auto chain_arg = func.getArgument(chain_index++); - auto src_identity = builder_chain_src.create( - chain_arg.getLoc(), chain_arg.getType(), chain_arg); + auto src_identity = TF::IdentityOp::create( + builder_chain_src, chain_arg.getLoc(), chain_arg.getType(), chain_arg); auto chain_src_island = CreateIsland(src_identity, {}, builder_chain_src); - auto sink_identity = builder_chain_sink.create( - chain_arg.getLoc(), chain_arg.getType(), chain_arg); + auto sink_identity = TF::IdentityOp::create( + builder_chain_sink, chain_arg.getLoc(), chain_arg.getType(), chain_arg); auto chain_sink_island = CreateIsland(sink_identity, {}, builder_chain_sink); @@ -477,7 +478,7 @@ void ChainResourceOps( IslandOp GetDummyConstant(OpBuilder builder, ShapedType const_type, Location loc) { DenseIntElementsAttr val = DenseIntElementsAttr::get(const_type, 1); - auto const_op = builder.create(loc, val); + auto const_op = TF::ConstOp::create(builder, loc, val); auto const_island = CreateIsland(const_op, {}, builder); return const_island; } @@ -506,8 +507,9 @@ TF::WhileOp RewriteWhileOp(TF::WhileOp while_op, int num_resource_inputs, } // Replace old while op with new while op. 
- auto new_while_op = builder.create( - while_op.getLoc(), new_result_types, new_operands, while_op->getAttrs()); + auto new_while_op = + TF::WhileOp::create(builder, while_op.getLoc(), new_result_types, + new_operands, while_op->getAttrs()); auto new_while_wrapper = CreateIsland(new_while_op, while_wrapper.getControlInputs(), builder); for (auto result : while_wrapper.getOutputs()) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc index 144bdb44018649..cda422d0d9938e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc @@ -137,12 +137,12 @@ class DecomposeRngReadAndSkipOp : public RewritePattern { // Read the state value from the resource. Value state = - rewriter.create(loc, res_type, rng_op.getResource()); + ReadVariableOp::create(rewriter, loc, res_type, rng_op.getResource()); // Extract the key and counter from the state. RankedTensorType word_type = RankedTensorType::get({}, state_element_type); - auto unpacked = rewriter.create( - loc, SmallVector(state_size, word_type), state, 0); + auto unpacked = UnpackOp::create( + rewriter, loc, SmallVector(state_size, word_type), state, 0); Value key = unpacked.getResult(counter_size); SmallVector counter; @@ -153,39 +153,40 @@ class DecomposeRngReadAndSkipOp : public RewritePattern { // Set the increment to 256 * delta. Type u64 = rewriter.getIntegerType(64, /*isSigned=*/false); RankedTensorType u64_scalar = RankedTensorType::get({}, u64); - Value step_size = rewriter.create(loc, GetScalarOfType(u64, 256)); + Value step_size = ConstOp::create(rewriter, loc, GetScalarOfType(u64, 256)); Value increment = - rewriter.create(loc, u64_scalar, step_size, rng_op.getDelta()); + MulOp::create(rewriter, loc, u64_scalar, step_size, rng_op.getDelta()); // Increment the counter. SmallVector pack_args; RankedTensorType word_u64_type = RankedTensorType::get({}, u64); - Value zero_u64 = rewriter.create(loc, GetScalarOfType(u64, 0)); - Value one_u64 = rewriter.create(loc, GetScalarOfType(u64, 1)); + Value zero_u64 = ConstOp::create(rewriter, loc, GetScalarOfType(u64, 0)); + Value one_u64 = ConstOp::create(rewriter, loc, GetScalarOfType(u64, 1)); for (int i = 0; i < counter_size; ++i) { Value word = counter[i]; - Value word_u64 = rewriter.create(loc, word_u64_type, word); - Value new_word_u64 = rewriter.create(loc, word_u64, increment); - Value new_word = rewriter.create(loc, word_type, new_word_u64); + Value word_u64 = CastOp::create(rewriter, loc, word_u64_type, word); + Value new_word_u64 = AddV2Op::create(rewriter, loc, word_u64, increment); + Value new_word = CastOp::create(rewriter, loc, word_type, new_word_u64); pack_args.push_back(new_word); - Value overflow = rewriter.create(loc, new_word_u64, word_u64); - increment = rewriter.create(loc, overflow, one_u64, zero_u64); + Value overflow = LessOp::create(rewriter, loc, new_word_u64, word_u64); + increment = + SelectV2Op::create(rewriter, loc, overflow, one_u64, zero_u64); } // Save the new state value to the resource. pack_args.push_back(key); - Value new_state = rewriter.create(loc, res_type, pack_args); - rewriter.create(loc, rng_op.getResource(), new_state); + Value new_state = PackOp::create(rewriter, loc, res_type, pack_args); + AssignVariableOp::create(rewriter, loc, rng_op.getResource(), new_state); // Pad the original state as necessary to fill the output shape. 
int pad = tensorflow::RNG_MAX_COUNTER_SIZE - counter_size;
Type i64 = rewriter.getI64Type();
RankedTensorType paddings_ty = RankedTensorType::get({1, 2}, i64);
std::vector paddings_values = {0, pad};
- Value paddings = rewriter.create<ConstOp>(
- loc, DenseIntElementsAttr::get(paddings_ty, paddings_values));
- Value output = rewriter.create<PadOp>(loc, op_type, state, paddings);
+ Value paddings = ConstOp::create(
+ rewriter, loc, DenseIntElementsAttr::get(paddings_ty, paddings_values));
+ Value output = PadOp::create(rewriter, loc, op_type, state, paddings);
rewriter.replaceOp(op, output);
return success();
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td
index 1fc666da4a8d95..9130ae844bc6b9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td
@@ -27,7 +27,7 @@ def EmptyList: NativeCodeCall<"llvm::SmallVector{}">;
// Creates a tf.ReadVariable op that reads a resource `$2` that has the same
// element type as `$1`. The op created will use location of `$0`.
def CreateTFReadVariableOp : NativeCodeCall<
- "$_builder.create<TF::ReadVariableOp>("
+ "TF::ReadVariableOp::create($_builder, "
" $0.getLoc(),"
" GetResourceSubtypeOrDefault("
" $2, llvm::cast($1.getType()).getElementType()),"
@@ -39,19 +39,19 @@ def CheckHasResourceSubtype : Constraint>;
def CreateConstBoolAttrFalse : NativeCodeCall<"$_builder.getBoolAttr(false)">;
def CreateTensorScatterAddOp : NativeCodeCall<
- "$_builder.create<TF::TensorScatterAddOp>("
+ "TF::TensorScatterAddOp::create($_builder, "
"$0.getLoc(), $0.getType(), $0, $1, $2, $_builder.getStringAttr(\"\"))">;
def CreateTensorScatterUpdateOp : NativeCodeCall<
- "$_builder.create<TF::TensorScatterUpdateOp>("
+ "TF::TensorScatterUpdateOp::create($_builder, "
"$0.getLoc(), $0.getType(), $0, $1, $2, $_builder.getStringAttr(\"\"))">;
def CreateTFReadVariableOpFromResourceHandle : NativeCodeCall<
- "$_builder.create<TF::ReadVariableOp>("
+ "TF::ReadVariableOp::create($_builder, "
"$0.getLoc(), GetResourceSubtype($1), $1)">;
def CreateTFSelectOp: NativeCodeCall<
- "$_builder.create<TF::SelectOp>("
+ "TF::SelectOp::create($_builder, "
"$0.getLoc(), $3.getType(), $1, $2, $3)">;
def ConstAttrIfThenElse: NativeCodeCall<
@@ -59,7 +59,7 @@ def ConstAttrIfThenElse: NativeCodeCall<
// Convert clamp(lo, x, hi) to clipbyvalue(x, lo, hi).
def Clamp: NativeCodeCall<
- "$_builder.create<TF::ClipByValueOp>("
+ "TF::ClipByValueOp::create($_builder, "
" $0.getLoc(),"
" $2.getType(), $2, $1, $3)">;
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc
index 954c318b416150..73dc7802c7d56d 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc
@@ -61,7 +61,7 @@ namespace {
ConstOp createI32ConstOp(int32_t value, Location loc, PatternRewriter* rewriter) {
auto int_attr = IntegerAttr::get(rewriter->getIntegerType(32), value);
- return rewriter->create<ConstOp>(loc, int_attr);
+ return ConstOp::create(*rewriter, loc, int_attr);
}
// Creates ConstantOp for array of int32_t.
@@ -70,7 +70,7 @@ arith::ConstantOp createI32ConstantOp(llvm::ArrayRef values, auto values_type = RankedTensorType::get( {static_cast(values.size())}, rewriter->getIntegerType(32)); auto constant_attr = rewriter->getI32TensorAttr(values); - return rewriter->create(loc, values_type, constant_attr); + return arith::ConstantOp::create(*rewriter, loc, values_type, constant_attr); } // Creates ConstantOp for array of int64_t. @@ -79,7 +79,7 @@ arith::ConstantOp createI64ConstantOp(llvm::ArrayRef values, auto values_type = RankedTensorType::get( {static_cast(values.size())}, rewriter->getIntegerType(64)); auto constant_attr = rewriter->getI64TensorAttr(values); - return rewriter->create(loc, values_type, constant_attr); + return arith::ConstantOp::create(*rewriter, loc, values_type, constant_attr); } // Function to create a tf.SumOp to sum the element in 'value' reduced along the @@ -98,8 +98,9 @@ TF::SumOp createSumOp(Value value, Location loc, sum_shape.push_back(shape[i]); } } - return rewriter->create( - loc, RankedTensorType::get(sum_shape, value_type.getElementType()), value, + return TF::SumOp::create( + *rewriter, loc, + RankedTensorType::get(sum_shape, value_type.getElementType()), value, redux_op); } @@ -115,8 +116,8 @@ TF::TransposeOp createTransposeOp(Value value, Location loc, } auto transposed_type = RankedTensorType::get(transposed_shape, value_type.getElementType()); - return rewriter->create(loc, transposed_type, value, - perm_op); + return TF::TransposeOp::create(*rewriter, loc, transposed_type, value, + perm_op); } TF::ReshapeOp createReshapeOp(Value value, ArrayRef shape, @@ -125,8 +126,8 @@ TF::ReshapeOp createReshapeOp(Value value, ArrayRef shape, auto shape_tensor = createI64ConstantOp( tensorflow::ConvertMlirShapeToTF(shape), loc, rewriter); Type resultType = RankedTensorType::get(shape, element_type); - return rewriter->create(loc, resultType, /*tensor=*/value, - /*shape=*/shape_tensor); + return TF::ReshapeOp::create(*rewriter, loc, resultType, /*tensor=*/value, + /*shape=*/shape_tensor); } // Creates ReshapeOp with runtime calcuation of required shape to support @@ -140,7 +141,7 @@ TF::ReshapeOp createReshapeOpForDynamic(Value value, ArrayRef shape, PatternRewriter* rewriter) { // Build ShapeOp auto input_shape = - rewriter->create(loc, value, rewriter->getBoolAttr(true)); + TF::ShapeOp::create(*rewriter, loc, value, rewriter->getBoolAttr(true)); // Build UnsortedSegmentProdOp Type segProdresultType = @@ -148,16 +149,16 @@ TF::ReshapeOp createReshapeOpForDynamic(Value value, ArrayRef shape, auto segids_tensor = createI32ConstantOp(reshape_segids, loc, rewriter); auto num_reshape_segids_tensor = createI32ConstOp(num_reshape_segids, loc, rewriter); - auto segprod = rewriter->create( - loc, segProdresultType, input_shape->getResults()[0], segids_tensor, - num_reshape_segids_tensor); + auto segprod = TF::UnsortedSegmentProdOp::create( + *rewriter, loc, segProdresultType, input_shape->getResults()[0], + segids_tensor, num_reshape_segids_tensor); // Build ReshapeOp with the result of UnsortedSegmentProdOp. 
Type out_tensor_type = RankedTensorType::get(shape, getElementTypeOrSelf(value.getType())); - return rewriter->create(loc, out_tensor_type, - /*tensor=*/value, - /*shape=*/segprod->getResults()[0]); + return TF::ReshapeOp::create(*rewriter, loc, out_tensor_type, + /*tensor=*/value, + /*shape=*/segprod->getResults()[0]); } struct EinsumDimensionNumbers { @@ -178,8 +179,8 @@ TF::ReshapeOp createOutputReshapeOpForDynamic( EinsumDimensionNumbers& dnums, Location loc, PatternRewriter* rewriter) { BoolAttr true_attr = rewriter->getBoolAttr(true); // Build ShapeOp - auto shape_lhs = rewriter->create(loc, org_lhs, true_attr); - auto shape_rhs = rewriter->create(loc, org_rhs, true_attr); + auto shape_lhs = TF::ShapeOp::create(*rewriter, loc, org_lhs, true_attr); + auto shape_rhs = TF::ShapeOp::create(*rewriter, loc, org_rhs, true_attr); std::vector bl_index; // Indexes of B0,...,Bn and L0,...,Ln bl_index.reserve(dnums.lhs_rhs_out.size() + dnums.lhs_out.size()); @@ -196,20 +197,20 @@ TF::ReshapeOp createOutputReshapeOpForDynamic( } auto lhs_index_tensor = createI32ConstantOp(bl_index, loc, rewriter); - auto gather_lhs = rewriter->create( - loc, + auto gather_lhs = TF::GatherOp::create( + *rewriter, loc, RankedTensorType::get({static_cast(bl_index.size())}, rewriter->getIntegerType(32)), shape_lhs->getResults()[0], lhs_index_tensor->getResults()[0], true_attr); auto rhs_index_tensor = createI32ConstantOp(r_index, loc, rewriter); - auto gather_rhs = rewriter->create( - loc, + auto gather_rhs = TF::GatherOp::create( + *rewriter, loc, RankedTensorType::get({static_cast(r_index.size())}, rewriter->getIntegerType(32)), shape_rhs->getResults()[0], rhs_index_tensor->getResults()[0], true_attr); Value zero_value = createI32ConstOp(0, loc, rewriter); - auto concat_out_shape = rewriter->create( - loc, + auto concat_out_shape = TF::ConcatOp::create( + *rewriter, loc, RankedTensorType::get({static_cast(bl_index.size()) + static_cast(r_index.size())}, rewriter->getIntegerType(32)), @@ -220,10 +221,9 @@ TF::ReshapeOp createOutputReshapeOpForDynamic( // Build ReshapeOp with the calculated output shape. 
Type out_type = RankedTensorType::get(shape, getElementTypeOrSelf(value.getType())); - return rewriter->create( - loc, out_type, - /*tensor=*/value, - /*shape=*/concat_out_shape->getResults()[0]); + return TF::ReshapeOp::create(*rewriter, loc, out_type, + /*tensor=*/value, + /*shape=*/concat_out_shape->getResults()[0]); } std::optional> EquationToMap( @@ -793,9 +793,9 @@ LogicalResult rewriteToBatchMatmul(TF::EinsumOp op, auto matmul_type = RankedTensorType::get(matmul_shape, original_type.getElementType()); - Value out = rewriter.create( - op.getLoc(), matmul_type, lhs, rhs, rewriter.getBoolAttr(false), - rewriter.getBoolAttr(false)); + Value out = TF::BatchMatMulV2Op::create(rewriter, op.getLoc(), matmul_type, + lhs, rhs, rewriter.getBoolAttr(false), + rewriter.getBoolAttr(false)); bool out_reshape_need = (reshape_shape.size() != matmul_shape.size() || original_type.getRank() != matmul_shape.size()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc index 18480fbd772fa9..883da73f2fb378 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc @@ -127,8 +127,8 @@ tf_device::LaunchOp CreateNewHostLaunchOpWithNewResult( for (Value result : new_launch_op_results) new_launch_op_results_types.push_back(result.getType()); - auto new_launch_op = builder.create( - old_launch_op->getLoc(), old_launch_op->getDeviceAttr(), + auto new_launch_op = tf_device::LaunchOp::create( + builder, old_launch_op->getLoc(), old_launch_op->getDeviceAttr(), /*result_types=*/new_launch_op_results_types); new_launch_op.getBody().takeBody(old_launch_op->getBody()); @@ -154,17 +154,16 @@ LogicalResult CreateNewDeviceLaunchOp( return failure(); } - new_device_launch_op = builder.create( - tpu_copy_with_dynamic_shape_op->getLoc(), + new_device_launch_op = tf_device::LaunchOp::create( + builder, tpu_copy_with_dynamic_shape_op->getLoc(), builder.getStringAttr(device_str), /*result_types=*/tpu_copy_with_dynamic_shape_op->getResultTypes()); new_device_launch_op.getBody().push_back(new Block); builder.setInsertionPointToEnd(&new_device_launch_op.GetBody()); - auto* return_op = builder - .create( - tpu_copy_with_dynamic_shape_op->getLoc(), - tpu_copy_with_dynamic_shape_op->getResults()) + auto* return_op = tf_device::ReturnOp::create( + builder, tpu_copy_with_dynamic_shape_op->getLoc(), + tpu_copy_with_dynamic_shape_op->getResults()) .getOperation(); tpu_copy_with_dynamic_shape_op->moveBefore(return_op); return success(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc index e73d76fbc5907d..b2ab71fa5129cb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc @@ -230,8 +230,8 @@ class FuseContractionWithBiasAdd : public OpRewritePattern { auto *bias_add_op = bias_add.getOperation(); if (bias_add_op) rewriter.setInsertionPoint(bias_add_op); - Value fused_op = rewriter.create(fused_loc, result_type, - ValueRange(operands), attrs); + Value fused_op = FusedOpT::create(rewriter, fused_loc, result_type, + ValueRange(operands), attrs); auto op_to_replace = fuse_activation ? 
activation : bias_add; rewriter.replaceOp(op_to_replace, ValueRange({fused_op})); return success(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc index 2c70a078fbb13a..18fc8fc1cb58cc 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc @@ -89,9 +89,9 @@ void MoveTailWritesAfterReplicate( OpBuilder builder(replicate_op); // Clone this old replicate op but with new result types. - auto new_replicate_op = builder.create( - replicate_op->getLoc(), new_result_types, replicate_op->getOperands(), - replicate_op->getAttrs()); + auto new_replicate_op = tf_device::ReplicateOp::create( + builder, replicate_op->getLoc(), new_result_types, + replicate_op->getOperands(), replicate_op->getAttrs()); // Move region to the new op. new_replicate_op.getRegion().takeBody(replicate_op.getRegion()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc index 9492c007b07ca5..7806967d7dcfe9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc @@ -413,8 +413,8 @@ void ReplaceParallelExecute( &output_types, parallel_execute, region_index + 1, num_regions); builder->setInsertionPoint(parallel_execute); - auto new_parallel_execute = builder->create( - parallel_execute.getLoc(), num_regions, output_types); + auto new_parallel_execute = tf_device::ParallelExecuteOp::create( + *builder, parallel_execute.getLoc(), num_regions, output_types); // Replace the uses of the original parallel_execute before region containing // merged execute. @@ -449,8 +449,8 @@ void ReplaceParallelExecute( // execute results. Operation* old_terminator = execute_region->front().getTerminator(); builder->setInsertionPointToEnd(&execute_region->front()); - builder->create(old_terminator->getLoc(), - merged_execute_launch.getResults()); + tf_device::ReturnOp::create(*builder, old_terminator->getLoc(), + merged_execute_launch.getResults()); old_terminator->erase(); // Remove the original TPUExecute op. @@ -532,8 +532,8 @@ LogicalResult MergeForOneTPUExecute( } // Create the merged execute and update variables op. - auto merged_execute = builder->create( - execute_launch.getLoc(), new_output_types, + auto merged_execute = TF::TPUExecuteAndUpdateVariablesOp::create( + *builder, execute_launch.getLoc(), new_output_types, var_access_info.new_operand_values, llvm::ArrayRef{ builder->getNamedAttr( @@ -544,14 +544,14 @@ LogicalResult MergeForOneTPUExecute( builder->getI64ArrayAttr(device_var_updates_indices))}); // Wrap in launch for device assignment. 
- auto merged_execute_launch = builder->create( - merged_execute.getLoc(), execute_launch.getDeviceAttr(), + auto merged_execute_launch = tf_device::LaunchOp::create( + *builder, merged_execute.getLoc(), execute_launch.getDeviceAttr(), merged_execute.getResultTypes()); merged_execute_launch.getBody().push_back(new Block); builder->setInsertionPointToEnd(&merged_execute_launch.GetBody()); - builder->create(merged_execute.getLoc(), - merged_execute.getResults()); + tf_device::ReturnOp::create(*builder, merged_execute.getLoc(), + merged_execute.getResults()); merged_execute.getOperation()->moveBefore( merged_execute_launch.GetBody().getTerminator()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc index ac9c18602804d7..a5bd582b7c2b5f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc @@ -224,8 +224,8 @@ LogicalResult SetMetadataProtoFromClusterFuncOp( tensorflow::tpu::TPUCompileMetadataProto* metadata) { if (auto options_attr = op->getAttrOfType("tpu_compile_options_proto")) { - if (!metadata->mutable_compile_options()->ParseFromArray( - options_attr.data(), options_attr.size())) { + if (!metadata->mutable_compile_options()->ParseFromString( + absl::string_view(options_attr.data(), options_attr.size()))) { return failure(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc index 85b61d16355077..0b5976b619ea26 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc @@ -90,7 +90,7 @@ static Value CreateTFCastOpF32(OpBuilder *builder, Location loc, Value x, auto x_type = mlir::dyn_cast_or_null(x.getType()); if (!x_type) llvm_unreachable("unsupported type"); Type type = x_type.clone(builder->getF32Type()); - return builder->create(loc, type, x, truncate); + return CastOp::create(*builder, loc, type, x, truncate); } // Returns a TF_CastOp to I32. This function is used for CastOps that are @@ -103,7 +103,7 @@ static Value CreateTFCastOpI32(OpBuilder *builder, Location loc, Value x, auto x_type = mlir::dyn_cast_or_null(x.getType()); if (!x_type) llvm_unreachable("unsupported type"); Type type = x_type.clone(builder->getI32Type()); - return builder->create(loc, type, x, truncate); + return CastOp::create(*builder, loc, type, x, truncate); } static APFloat ConvertToAPFloat(double val, Type type) { @@ -125,22 +125,24 @@ static Value GetDimensionSize(OpBuilder *builder, Location loc, Value input, } // Return a ConstOp if it's static dimension. if (!ranked_ty.isDynamicDim(idx)) { - return builder->create( - loc, GetScalarOfType( - builder->getIntegerType(use_32bit.getValue() ? 32 : 64), - ranked_ty.getDimSize(idx))); + return TF::ConstOp::create( + *builder, loc, + GetScalarOfType( + builder->getIntegerType(use_32bit.getValue() ? 
32 : 64), + ranked_ty.getDimSize(idx))); } } - auto shape = builder->create(loc, input, use_32bit); - return builder->create( - loc, mlir::RankedTensorType::get({}, getElementTypeOrSelf(shape)), shape, + auto shape = TF::ShapeOp::create(*builder, loc, input, use_32bit); + return TF::StridedSliceOp::create( + *builder, loc, + mlir::RankedTensorType::get({}, getElementTypeOrSelf(shape)), shape, /*begin=*/ - builder->create(loc, builder->getI32TensorAttr({idx})), + TF::ConstOp::create(*builder, loc, builder->getI32TensorAttr({idx})), /*end=*/ - builder->create(loc, builder->getI32TensorAttr({idx + 1})), + TF::ConstOp::create(*builder, loc, builder->getI32TensorAttr({idx + 1})), /*strides=*/ - builder->create(loc, builder->getI32TensorAttr({1})), + TF::ConstOp::create(*builder, loc, builder->getI32TensorAttr({1})), /*begin_mask=*/0, /*end_mask=*/0, /*ellipsis_mask=*/0, /*new_axis_mask=*/0, /*shrink_axis_mask=*/1); } @@ -211,9 +213,9 @@ Value ValuesToRank1(PatternRewriter &rewriter, Location loc, Type dtype, ArrayRef vals) { int64_t length = vals.size(); auto type = tensorflow::GetTypeFromTFTensorShape({length}, dtype); - auto axis = rewriter.create( - loc, GetScalarOfType(rewriter.getIntegerType(64), 0)); - return rewriter.create(loc, type, ValueRange(vals), axis); + auto axis = ConstOp::create(rewriter, loc, + GetScalarOfType(rewriter.getIntegerType(64), 0)); + return ConcatV2Op::create(rewriter, loc, type, ValueRange(vals), axis); } // Lowers AddN op to a sequence of AddV2 ops to accumulate operands. @@ -277,10 +279,10 @@ class LowerAddNOp : public RewritePattern { while (n > 1) { for (int64_t i = 0; i < n; i += 2) { // Add two adjacent operands if applicable. - operands[i / 2] = - (i + 1 < n) ? rewriter.create(addn_op.getLoc(), - operands[i], operands[i + 1]) - : operands[i]; + operands[i / 2] = (i + 1 < n) + ? AddV2Op::create(rewriter, addn_op.getLoc(), + operands[i], operands[i + 1]) + : operands[i]; } n = (n + 1) / 2; } @@ -363,8 +365,8 @@ class LowerDynamicStitchOp : public RewritePattern { packed_shape.push_back(-1); packed_shape.append(item_shape.begin(), item_shape.end()); Location loc = op.getLoc(); - auto packed_shape_val = rewriter.create( - loc, GetI64ElementsAttr(packed_shape, &rewriter)); + auto packed_shape_val = ConstOp::create( + rewriter, loc, GetI64ElementsAttr(packed_shape, &rewriter)); // Prepare each of the output item by unpacking data and then putting it to // the specified index. @@ -374,12 +376,13 @@ class LowerDynamicStitchOp : public RewritePattern { Value data = std::get<1>(it); auto reshaped_data = - rewriter.create(loc, data, packed_shape_val); + ReshapeOp::create(rewriter, loc, data, packed_shape_val); auto num_items = mlir::cast(reshaped_data.getType()).getShape()[0]; - auto items = rewriter.create( - loc, SmallVector(num_items, item_ty), reshaped_data, - /*axis=*/0); + auto items = UnpackOp::create(rewriter, loc, + SmallVector(num_items, item_ty), + reshaped_data, + /*axis=*/0); for (auto index_item : llvm::zip(index_attr, items.getResults())) { int64_t output_index = std::get<0>(index_item).getSExtValue(); Value item = std::get<1>(index_item); @@ -426,80 +429,84 @@ class ConvertFakeQuantWithMinMaxVarsOp : public RewritePattern { auto float_min = op.getMin(); auto float_max = op.getMax(); - auto float_diff = rewriter.create(op.getLoc(), float_max, float_min); + auto float_diff = + SubOp::create(rewriter, op.getLoc(), float_max, float_min); // Compute the range when quantized. 
- auto quant_min = rewriter.create( - op.getLoc(), DenseElementsAttr::get( - scalar_ty, ConvertToAPFloat(bits_min, element_ty))); - - auto quant_max = rewriter.create( - op.getLoc(), DenseElementsAttr::get( - scalar_ty, ConvertToAPFloat(bits_max, element_ty))); - - auto quant_diff = rewriter.create( - op.getLoc(), + auto quant_min = + ConstOp::create(rewriter, op.getLoc(), + DenseElementsAttr::get( + scalar_ty, ConvertToAPFloat(bits_min, element_ty))); + + auto quant_max = + ConstOp::create(rewriter, op.getLoc(), + DenseElementsAttr::get( + scalar_ty, ConvertToAPFloat(bits_max, element_ty))); + + auto quant_diff = ConstOp::create( + rewriter, op.getLoc(), DenseElementsAttr::get( scalar_ty, ConvertToAPFloat(bits_max - bits_min, element_ty))); auto quant_to_float = - rewriter.create(op.getLoc(), float_diff, quant_diff); + DivOp::create(rewriter, op.getLoc(), float_diff, quant_diff); auto float_to_quant = - rewriter.create(op.getLoc(), quant_diff, float_diff); + DivOp::create(rewriter, op.getLoc(), quant_diff, float_diff); // During quantization, the quantized min/max values may not line up // perfectly with the specified min/max. Nudge them into the right range. auto min_scaled = - rewriter.create(op.getLoc(), float_min, quant_to_float); + DivOp::create(rewriter, op.getLoc(), float_min, quant_to_float); auto min_scaled_sub = - rewriter.create(op.getLoc(), quant_min, min_scaled); + SubOp::create(rewriter, op.getLoc(), quant_min, min_scaled); auto mid_rounded = - rewriter.create(op.getLoc(), scalar_ty, min_scaled_sub); + RoundOp::create(rewriter, op.getLoc(), scalar_ty, min_scaled_sub); - auto nudged_zero_point_val = rewriter.create( - op.getLoc(), scalar_ty, mid_rounded, quant_min, quant_max); + auto nudged_zero_point_val = ClipByValueOp::create( + rewriter, op.getLoc(), scalar_ty, mid_rounded, quant_min, quant_max); auto quant_min_sub = - rewriter.create(op.getLoc(), quant_min, nudged_zero_point_val); + SubOp::create(rewriter, op.getLoc(), quant_min, nudged_zero_point_val); auto quant_max_sub = - rewriter.create(op.getLoc(), quant_max, nudged_zero_point_val); + SubOp::create(rewriter, op.getLoc(), quant_max, nudged_zero_point_val); auto nudged_float_min = - rewriter.create(op.getLoc(), quant_min_sub, quant_to_float); + MulOp::create(rewriter, op.getLoc(), quant_min_sub, quant_to_float); auto nudged_float_max = - rewriter.create(op.getLoc(), quant_max_sub, quant_to_float); + MulOp::create(rewriter, op.getLoc(), quant_max_sub, quant_to_float); // Now quantize the input value with the approximated min/max values. // Move the input value into quantized space - Value quantized_input = rewriter.create( - op.getLoc(), input_ty, input, nudged_float_min, nudged_float_max); + Value quantized_input = + ClipByValueOp::create(rewriter, op.getLoc(), input_ty, input, + nudged_float_min, nudged_float_max); - quantized_input = rewriter.create(op.getLoc(), input_ty, - quantized_input, nudged_float_min); + quantized_input = SubOp::create(rewriter, op.getLoc(), input_ty, + quantized_input, nudged_float_min); - quantized_input = rewriter.create(op.getLoc(), input_ty, - quantized_input, float_to_quant); + quantized_input = MulOp::create(rewriter, op.getLoc(), input_ty, + quantized_input, float_to_quant); // Round the quantized input always to the positive direction. 
- auto half_val = rewriter.create( - op.getLoc(), + auto half_val = ConstOp::create( + rewriter, op.getLoc(), DenseElementsAttr::get(scalar_ty, ConvertToAPFloat(0.5, element_ty))); - quantized_input = rewriter.create(op.getLoc(), input_ty, - quantized_input, half_val); + quantized_input = AddV2Op::create(rewriter, op.getLoc(), input_ty, + quantized_input, half_val); - quantized_input = rewriter.create(op.getLoc(), quantized_input); + quantized_input = FloorOp::create(rewriter, op.getLoc(), quantized_input); // Convert back into floating point spae. - Value output = rewriter.create(op.getLoc(), input_ty, - quantized_input, quant_to_float); + Value output = MulOp::create(rewriter, op.getLoc(), input_ty, + quantized_input, quant_to_float); - output = rewriter.create(op.getLoc(), input_ty, output, - nudged_float_min); + output = AddV2Op::create(rewriter, op.getLoc(), input_ty, output, + nudged_float_min); rewriter.replaceOp(op, {output}); return success(); @@ -549,20 +556,21 @@ class LowerInvertPermutationOp : public RewritePattern { Type int_type = x_type.getElementType(); // Could be i32 or i64. auto result_type = x_type; - auto start = rewriter.create(loc, GetScalarOfType(int_type, 0)); - Value limit = rewriter.create( - loc, GetScalarOfType(int_type, x_type.getShape()[0])); - auto delta = rewriter.create(loc, GetScalarOfType(int_type, 1)); + auto start = ConstOp::create(rewriter, loc, GetScalarOfType(int_type, 0)); + Value limit = ConstOp::create( + rewriter, loc, GetScalarOfType(int_type, x_type.getShape()[0])); + auto delta = ConstOp::create(rewriter, loc, GetScalarOfType(int_type, 1)); // Construct a sequence of numbers [0, 1, ... len(x)-1]. auto updates = - rewriter.create(loc, result_type, start, limit, delta); + RangeOp::create(rewriter, loc, result_type, start, limit, delta); auto shape_type = tensorflow::GetTypeFromTFTensorShape({2}, rewriter.getIntegerType(32)); - auto shape = rewriter.create( - loc, DenseElementsAttr::get( - shape_type, {static_cast(x_type.getDimSize(0)), 1})); - auto indices = rewriter.create(loc, op.getX(), shape); + auto shape = ConstOp::create( + rewriter, loc, + DenseElementsAttr::get(shape_type, + {static_cast(x_type.getDimSize(0)), 1})); + auto indices = ReshapeOp::create(rewriter, loc, op.getX(), shape); rewriter.replaceOpWithNewOp( op, result_type, op.getX(), indices, updates); @@ -641,16 +649,17 @@ class LowerLgammaOp : public RewritePattern { } else { tensor_type = UnrankedTensorType::get(float_type); } - input = rewriter.create(loc, tensor_type, input); + input = CastOp::create(rewriter, loc, tensor_type, input); } // Helper lambda function for creating a ConstOp for a tensor filled with // the given constant float value. 
auto create_const_op = [&rewriter, loc, tensor_type, float_type](double value) { - return rewriter.create( - loc, DenseElementsAttr::get(tensor_type, - FloatAttr::get(float_type, value))); + return ConstOp::create( + rewriter, loc, + DenseElementsAttr::get(tensor_type, + FloatAttr::get(float_type, value))); }; Value one_half = create_const_op(0.5); @@ -664,26 +673,26 @@ class LowerLgammaOp : public RewritePattern { create_const_op(std::log(kLanczosGamma + 0.5)); Value base_lanczos_coeff = create_const_op(kBaseLanczosCoeff); - Value minus_input = rewriter.create(loc, input); - Value input_minus_one = rewriter.create(loc, input, one); + Value minus_input = NegOp::create(rewriter, loc, input); + Value input_minus_one = SubOp::create(rewriter, loc, input, one); // If the input is less than 0.5 use Euler's reflection formula: // gamma(x) = pi / (sin(pi * x) * gamma(1 - x)) - Value need_to_reflect = rewriter.create(loc, input, one_half); + Value need_to_reflect = LessOp::create(rewriter, loc, input, one_half); Type tensor_bool_type = need_to_reflect.getType(); - Value z = rewriter.create(loc, need_to_reflect, minus_input, - input_minus_one); + Value z = SelectV2Op::create(rewriter, loc, need_to_reflect, minus_input, + input_minus_one); Value x = base_lanczos_coeff; for (int i = 0, end = kLanczosCoefficients.size(); i < end; ++i) { Value lanczos_coefficient = create_const_op(kLanczosCoefficients[i]); Value index = create_const_op(static_cast(i)); - Value z_plus_index = rewriter.create(loc, z, index); + Value z_plus_index = AddV2Op::create(rewriter, loc, z, index); Value z_plus_index_plus_one = - rewriter.create(loc, z_plus_index, one); - Value incr = rewriter.create(loc, lanczos_coefficient, - z_plus_index_plus_one); - x = rewriter.create(loc, x, incr); + AddV2Op::create(rewriter, loc, z_plus_index, one); + Value incr = DivOp::create(rewriter, loc, lanczos_coefficient, + z_plus_index_plus_one); + x = AddV2Op::create(rewriter, loc, x, incr); } // To improve accuracy on platforms with less-precise log implementations, @@ -691,14 +700,14 @@ class LowerLgammaOp : public RewritePattern { // the device. // log(t) = log(kLanczosGamma + 0.5 + z) // = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5)) - Value t = rewriter.create(loc, lanczos_gamma_plus_one_half, z); + Value t = AddV2Op::create(rewriter, loc, lanczos_gamma_plus_one_half, z); Value z_div_lanczos_gamma_plus_one_half = - rewriter.create(loc, z, lanczos_gamma_plus_one_half); + DivOp::create(rewriter, loc, z, lanczos_gamma_plus_one_half); Value log1p_z_div_lanczos_gamma_plus_one_half = - rewriter.create(loc, z_div_lanczos_gamma_plus_one_half); + Log1pOp::create(rewriter, loc, z_div_lanczos_gamma_plus_one_half); Value log_t = - rewriter.create(loc, log_lanczos_gamma_plus_one_half, - log1p_z_div_lanczos_gamma_plus_one_half); + AddV2Op::create(rewriter, loc, log_lanczos_gamma_plus_one_half, + log1p_z_div_lanczos_gamma_plus_one_half); // Compute the final result (modulo reflection). t(z) may be large, and we // need to be careful not to overflow to infinity in the first term of @@ -710,17 +719,17 @@ class LowerLgammaOp : public RewritePattern { // (z + 1/2 - t(z) / log(t(z))) * log(t(z)). 
// // log_y = log_sqrt_two_pi + (z + one_half - t / log_t) * log_t + Log(x); - Value t_div_log_t = rewriter.create(loc, t, log_t); + Value t_div_log_t = DivOp::create(rewriter, loc, t, log_t); Value one_half_minus_t_div_log_t = - rewriter.create(loc, one_half, t_div_log_t); + SubOp::create(rewriter, loc, one_half, t_div_log_t); Value z_plus_one_half_minus_t_div_log_t = - rewriter.create(loc, z, one_half_minus_t_div_log_t); + AddV2Op::create(rewriter, loc, z, one_half_minus_t_div_log_t); Value z_plus_one_half_minus_t_div_log_t_mul_log_t = - rewriter.create(loc, z_plus_one_half_minus_t_div_log_t, log_t); - Value log_x = rewriter.create(loc, x); - Value log_y_rhs = rewriter.create( - loc, z_plus_one_half_minus_t_div_log_t_mul_log_t, log_x); - Value log_y = rewriter.create(loc, log_sqrt_two_pi, log_y_rhs); + MulOp::create(rewriter, loc, z_plus_one_half_minus_t_div_log_t, log_t); + Value log_x = LogOp::create(rewriter, loc, x); + Value log_y_rhs = AddV2Op::create( + rewriter, loc, z_plus_one_half_minus_t_div_log_t_mul_log_t, log_x); + Value log_y = AddV2Op::create(rewriter, loc, log_sqrt_two_pi, log_y_rhs); // Compute the reflected value, used when x < 0.5: // @@ -747,48 +756,48 @@ class LowerLgammaOp : public RewritePattern { // Furthermore, pi * abs(frac(x)) loses precision when abs(frac(x)) is close // to 1. To remedy this, we can use the fact that sin(pi * x) in the domain // [0, 1] is symmetric across the line Y=0.5. - Value abs_input = rewriter.create(loc, input); - Value abs_input_floor = rewriter.create(loc, abs_input); + Value abs_input = AbsOp::create(rewriter, loc, input); + Value abs_input_floor = FloorOp::create(rewriter, loc, abs_input); Value abs_frac_input = - rewriter.create(loc, abs_input, abs_input_floor); + SubOp::create(rewriter, loc, abs_input, abs_input_floor); // Convert values of abs_frac_input > 0.5 to (1 - frac_input) to improve // precision of pi * abs_frac_input for values of abs_frac_input close to 1. Value one_minus_abs_frac_input = - rewriter.create(loc, one, abs_frac_input); + SubOp::create(rewriter, loc, one, abs_frac_input); Value abs_frac_input_gt_one_half = - rewriter.create(loc, abs_frac_input, one_half); + GreaterOp::create(rewriter, loc, abs_frac_input, one_half); Value reduced_frac_input = - rewriter.create(loc, abs_frac_input_gt_one_half, - one_minus_abs_frac_input, abs_frac_input); + SelectV2Op::create(rewriter, loc, abs_frac_input_gt_one_half, + one_minus_abs_frac_input, abs_frac_input); Value pi_mul_reduced_frac_input = - rewriter.create(loc, pi, reduced_frac_input); + MulOp::create(rewriter, loc, pi, reduced_frac_input); Value sin_pi_mul_reduced_frac_input = - rewriter.create(loc, pi_mul_reduced_frac_input); + SinOp::create(rewriter, loc, pi_mul_reduced_frac_input); Value reflection_denom = - rewriter.create(loc, sin_pi_mul_reduced_frac_input); + LogOp::create(rewriter, loc, sin_pi_mul_reduced_frac_input); // Avoid computing -inf - inf, which is nan. If reflection_denom is +/-inf, // then it "wins" and the result is +/-inf. 
     Value is_finite =
-        rewriter.create<IsFiniteOp>(loc, tensor_bool_type, reflection_denom);
-    Value neg_reflection_denom = rewriter.create<NegOp>(loc, reflection_denom);
+        IsFiniteOp::create(rewriter, loc, tensor_bool_type, reflection_denom);
+    Value neg_reflection_denom = NegOp::create(rewriter, loc, reflection_denom);
     Value log_pi_minus_reflection_denom =
-        rewriter.create<SubOp>(loc, log_pi, reflection_denom);
+        SubOp::create(rewriter, loc, log_pi, reflection_denom);
     Value reflection_if_finite =
-        rewriter.create<SubOp>(loc, log_pi_minus_reflection_denom, log_y);
-    Value reflection = rewriter.create<SelectV2Op>(
-        loc, is_finite, reflection_if_finite, neg_reflection_denom);
+        SubOp::create(rewriter, loc, log_pi_minus_reflection_denom, log_y);
+    Value reflection = SelectV2Op::create(
+        rewriter, loc, is_finite, reflection_if_finite, neg_reflection_denom);
     Value result =
-        rewriter.create<SelectV2Op>(loc, need_to_reflect, reflection, log_y);
+        SelectV2Op::create(rewriter, loc, need_to_reflect, reflection, log_y);
 
     // lgamma(+/-inf) = +inf.
-    Value is_inf = rewriter.create<IsInfOp>(loc, tensor_bool_type, input);
-    result = rewriter.create<SelectV2Op>(loc, is_inf, infinity, result);
+    Value is_inf = IsInfOp::create(rewriter, loc, tensor_bool_type, input);
+    result = SelectV2Op::create(rewriter, loc, is_inf, infinity, result);
 
     if (needs_cast) {
-      result = rewriter.create<CastOp>(loc, original_tensor_type, result);
+      result = CastOp::create(rewriter, loc, original_tensor_type, result);
     }
 
     rewriter.replaceOp(op, result);
@@ -819,10 +828,11 @@ class LowerPackOp : public RewritePattern {
     auto op = cast<PackOp>(src_op);
     Location loc = op.getLoc();
 
-    auto axis_value = rewriter.create<ConstOp>(
-        loc, DenseElementsAttr::get(tensorflow::GetTypeFromTFTensorShape(
-                 {}, rewriter.getIntegerType(64)),
-             op.getAxis()));
+    auto axis_value = ConstOp::create(
+        rewriter, loc,
+        DenseElementsAttr::get(tensorflow::GetTypeFromTFTensorShape(
+                                   {}, rewriter.getIntegerType(64)),
+                               op.getAxis()));
     int64_t axis = op.getAxis();
 
     Type prev_input_ty, inferred_ty;
@@ -838,7 +848,7 @@ class LowerPackOp : public RewritePattern {
         prev_input_ty = input_ty;
       }
       expanded_inputs.push_back(
-          rewriter.create<ExpandDimsOp>(loc, inferred_ty, input, axis_value));
+          ExpandDimsOp::create(rewriter, loc, inferred_ty, input, axis_value));
     }
 
     rewriter.replaceOpWithNewOp<ConcatV2Op>(op, op.getType(), expanded_inputs,
@@ -922,28 +932,28 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
     auto block_shape_i64_type = tensorflow::GetTypeFromTFTensorShape(
         block_shape_type.getShape(), rewriter.getIntegerType(64));
     auto block_shape_i64 =
-        rewriter.create<CastOp>(loc, block_shape_i64_type, op.getBlockShape());
+        CastOp::create(rewriter, loc, block_shape_i64_type, op.getBlockShape());
 
     auto paddings_i64_type = tensorflow::GetTypeFromTFTensorShape(
         paddings_type.getShape(), rewriter.getIntegerType(64));
     auto paddings_i64 =
-        rewriter.create<CastOp>(loc, paddings_i64_type, op.getPaddings());
+        CastOp::create(rewriter, loc, paddings_i64_type, op.getPaddings());
 
-    auto pad00 = rewriter.create<ConstOp>(
-        loc, DenseElementsAttr::get(
-                 tensorflow::GetTypeFromTFTensorShape(
-                     {1, 2}, rewriter.getIntegerType(64)),
-                 {0, 0}));
+    auto pad00 = ConstOp::create(rewriter, loc,
+                                 DenseElementsAttr::get(
+                                     tensorflow::GetTypeFromTFTensorShape(
+                                         {1, 2}, rewriter.getIntegerType(64)),
+                                     {0, 0}));
     SmallVector<Value, 4> full_paddings_list{pad00, paddings_i64};
     full_paddings_list.append(remaining_rank, pad00);
     auto full_paddings_type = tensorflow::GetTypeFromTFTensorShape(
         {input_rank, 2}, rewriter.getIntegerType(64));
-    auto zero_i64 = rewriter.create<ConstOp>(
-        loc, GetScalarOfType(rewriter.getIntegerType(64), 0));
+    auto zero_i64 = ConstOp::create(
+        rewriter, loc, GetScalarOfType(rewriter.getIntegerType(64), 0));
     // Extends paddings to all dimensions of input by adding 0s to non-block
     // dimensions.
-    auto full_paddings = rewriter.create<ConcatV2Op>(
-        loc, full_paddings_type, full_paddings_list, zero_i64);
+    auto full_paddings = ConcatV2Op::create(rewriter, loc, full_paddings_type,
+                                            full_paddings_list, zero_i64);
 
     // Compute the result type here instead of using shape inference because the
     // full_paddings won't be available as a constant for shape inference.
@@ -973,45 +983,44 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
         tensorflow::GetTypeFromTFTensorShape(padded_shape, element_type);
     // padded = pad(input, full_paddings)
     auto padded =
-        rewriter.create<PadOp>(loc, padded_type, op.getInput(), full_paddings);
+        PadOp::create(rewriter, loc, padded_type, op.getInput(), full_paddings);
 
     auto paddings_sum_type = tensorflow::GetTypeFromTFTensorShape(
         {input_rank}, rewriter.getIntegerType(64));
     // paddings_sum = paddings[*,0] + paddings[*,1]
-    auto paddings_split = rewriter.create<UnpackOp>(
-        loc, TypeRange({paddings_sum_type, paddings_sum_type}), full_paddings,
-        rewriter.getI64IntegerAttr(1));
-    auto paddings_sum = rewriter.create<AddV2Op>(
-        loc, paddings_split.getResult(0), paddings_split.getResult(1));
-
-    auto input_shape_tensor = rewriter.create<ConstOp>(
-        loc,
+    auto paddings_split = UnpackOp::create(
+        rewriter, loc, TypeRange({paddings_sum_type, paddings_sum_type}),
+        full_paddings, rewriter.getI64IntegerAttr(1));
+    auto paddings_sum =
+        AddV2Op::create(rewriter, loc, paddings_split.getResult(0),
+                        paddings_split.getResult(1));
+
+    auto input_shape_tensor = ConstOp::create(
+        rewriter, loc,
         DenseElementsAttr::get(tensorflow::GetTypeFromTFTensorShape(
                                    {input_rank}, rewriter.getIntegerType(64)),
                                input_shape));
 
     // padded_shape_tensor is the shape of padded.
     auto padded_shape_tensor =
-        rewriter.create<AddV2Op>(loc, paddings_sum, input_shape_tensor);
+        AddV2Op::create(rewriter, loc, paddings_sum, input_shape_tensor);
 
-    auto zero_i32 = rewriter.create<ConstOp>(
-        loc, GetScalarOfType(rewriter.getIntegerType(32), 0));
+    auto zero_i32 = ConstOp::create(
+        rewriter, loc, GetScalarOfType(rewriter.getIntegerType(32), 0));
     SmallVector<Type> padded_shape_splits_types(
         input_rank,
         tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getIntegerType(64)));
     SmallVector<Value> padded_shape_splits(
-        rewriter
-            .create<SplitOp>(loc, padded_shape_splits_types, zero_i32,
-                             padded_shape_tensor)
+        SplitOp::create(rewriter, loc, padded_shape_splits_types, zero_i32,
+                        padded_shape_tensor)
             .getOutput());
 
     SmallVector<Type> block_shape_splits_types(
         block_rank,
         tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getIntegerType(64)));
     SmallVector<Value> block_shape_splits(
-        rewriter
-            .create<SplitOp>(loc, block_shape_splits_types, zero_i32,
-                             block_shape_i64)
+        SplitOp::create(rewriter, loc, block_shape_splits_types, zero_i32,
+                        block_shape_i64)
            .getOutput());
 
     SmallVector<int64_t> outer_shape_ints;
@@ -1019,8 +1028,8 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
     for (int64_t i = 0; i < block_rank; ++i) {
       // TODO(b/157475606): Insert tf.Assert that the following division has
      // remainder 0.
-      outer_shape_vals.push_back(rewriter.create<DivOp>(
-          loc, padded_shape_splits[1 + i], block_shape_splits[i]));
+      outer_shape_vals.push_back(DivOp::create(
+          rewriter, loc, padded_shape_splits[1 + i], block_shape_splits[i]));
 
       auto padded_shape_i = padded_shape[1 + i];
       auto block_shape_ints_i = block_shape_ints[i];
@@ -1049,8 +1058,8 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
     auto reshaped_shape = ValuesToRank1(
         rewriter, loc, rewriter.getIntegerType(64), reshaped_shape_vals);
 
-    auto reshaped = rewriter.create<ReshapeOp>(
-        loc,
+    auto reshaped = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(reshaped_shape_ints, element_type),
         padded, reshaped_shape);
 
@@ -1065,14 +1074,14 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
     for (int64_t i = 1 + block_rank; i < input_rank; ++i) {
       permutation_vals.push_back(block_rank + i);
     }
-    auto permutation = rewriter.create<ConstOp>(
-        loc, GetI64ElementsAttr(permutation_vals, &rewriter));
+    auto permutation = ConstOp::create(
+        rewriter, loc, GetI64ElementsAttr(permutation_vals, &rewriter));
 
-    auto permuted = rewriter.create<TransposeOp>(loc, reshaped, permutation);
+    auto permuted = TransposeOp::create(rewriter, loc, reshaped, permutation);
 
     auto output_batch = padded_shape_splits[0];
     for (int64_t i = 0; i < block_rank; ++i) {
       output_batch =
-          rewriter.create<MulOp>(loc, output_batch, block_shape_splits[i]);
+          MulOp::create(rewriter, loc, output_batch, block_shape_splits[i]);
     }
 
     SmallVector<Value> output_shape_vals{output_batch};
     for (int64_t i = 0; i < block_rank; ++i) {
@@ -1163,11 +1172,11 @@ class LowerBatchToSpaceND : public RewritePattern {
     std::copy(input_shape.begin() + 1, input_shape.end(),
               reshaped_shape.begin() + block_rank + 1);
 
-    auto reshaped = rewriter.create<TF::ReshapeOp>(
-        op.getLoc(),
+    auto reshaped = TF::ReshapeOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape(reshaped_shape, element_ty), input,
-        rewriter.create<ConstOp>(op.getLoc(),
-                                 rewriter.getI64TensorAttr(reshaped_shape)));
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(reshaped_shape)));
 
     // 2. Permute dimensions of `reshaped` to produce `permuted` of shape
     //    [batch / prod(block_shape),
@@ -1191,12 +1200,12 @@ class LowerBatchToSpaceND : public RewritePattern {
       transpose_shape[it.index()] = reshaped_shape[it.value()];
     }
 
-    auto permuted = rewriter.create<TF::TransposeOp>(
-        op.getLoc(),
+    auto permuted = TF::TransposeOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape(transpose_shape, element_ty),
         reshaped,
-        rewriter.create<ConstOp>(op.getLoc(),
-                                 rewriter.getI64TensorAttr(permutation)));
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(permutation)));
 
     // 3. Reshape `permuted` to produce `reshaped_permuted` of shape
     //    [batch / prod(block_shape),
@@ -1219,13 +1228,13 @@ class LowerBatchToSpaceND : public RewritePattern {
     std::copy(remainder_shape.begin(), remainder_shape.end(),
               reshaped_permuted_shape.begin() + 1 + block_rank);
 
-    auto reshaped_permuted = rewriter.create<TF::ReshapeOp>(
-        op.getLoc(),
+    auto reshaped_permuted = TF::ReshapeOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape(reshaped_permuted_shape,
                                              element_ty),
         permuted,
-        rewriter.create<ConstOp>(
-            op.getLoc(), rewriter.getI64TensorAttr(reshaped_permuted_shape)));
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(reshaped_permuted_shape)));
 
     // 4. Crop the start and end of dimensions `[1, ..., M]` of
     //    `reshaped_permuted` according to `crops` to produce the output of
@@ -1263,10 +1272,10 @@ class LowerBatchToSpaceND : public RewritePattern {
     rewriter.replaceOpWithNewOp<SliceOp>(
         op, tensorflow::GetTypeFromTFTensorShape(slice_sizes, element_ty),
         reshaped_permuted,
-        rewriter.create<ConstOp>(op.getLoc(),
-                                 rewriter.getI64TensorAttr(start_indices)),
-        rewriter.create<ConstOp>(op.getLoc(),
-                                 rewriter.getI64TensorAttr(slice_sizes)));
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(start_indices)),
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(slice_sizes)));
     return success();
   }
 };
@@ -1310,11 +1319,11 @@ class LowerSparseMatMulOp : public RewritePattern {
         tensor_type_f32 = UnrankedTensorType::get(Float32Type::get(context));
       }
       // Add cast to f32 to conform with element type of result.
-      operand = rewriter.create<CastOp>(op.getLoc(), tensor_type_f32, operand);
+      operand = CastOp::create(rewriter, op.getLoc(), tensor_type_f32, operand);
     }
 
-    Value result = rewriter.create<MatMulOp>(
-        op.getLoc(), op.getProduct().getType(), operands[0], operands[1],
-        op.getTransposeA(), op.getTransposeB());
+    Value result = MatMulOp::create(
+        rewriter, op.getLoc(), op.getProduct().getType(), operands[0],
+        operands[1], op.getTransposeA(), op.getTransposeB());
 
     rewriter.replaceOp(op, {result});
     return success();
@@ -1441,20 +1450,22 @@ class LowerResizeNearestNeighbor : public RewritePattern {
     }
 
     auto one =
-        rewriter.create<ConstOp>(loc, GetScalarOfType(out_size_element_ty, 1));
+        ConstOp::create(rewriter, loc, GetScalarOfType(out_size_element_ty, 1));
 
     // Extract the image shape.
-    Value input_shape = rewriter.create<ShapeOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({4}, rewriter.getI64Type()),
-        input);
-    input_shape = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({4}, out_size_element_ty),
-        input_shape);
+    Value input_shape = ShapeOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({4}, rewriter.getI64Type()),
+        input);
+    input_shape = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({4}, out_size_element_ty),
+        input_shape);
 
     auto scalar_dim_ty =
         tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty);
-    auto split_image_shape = rewriter.create<UnpackOp>(
-        loc,
+    auto split_image_shape = UnpackOp::create(
+        rewriter, loc,
         TypeRange({scalar_dim_ty, scalar_dim_ty, scalar_dim_ty, scalar_dim_ty}),
         input_shape);
 
@@ -1464,151 +1475,156 @@ class LowerResizeNearestNeighbor : public RewritePattern {
     auto in_x = split_image_shape.getResult(2);
     auto channels = split_image_shape.getResult(3);
 
-    auto in_count = rewriter.create<MulOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty),
-        in_y, in_x);
+    auto in_count = MulOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty), in_y,
+        in_x);
 
     // Unpack and separate the out width/height.
-    auto split_out_size = rewriter.create<UnpackOp>(
-        loc, TypeRange({scalar_dim_ty, scalar_dim_ty}), out_size);
+    auto split_out_size = UnpackOp::create(
+        rewriter, loc, TypeRange({scalar_dim_ty, scalar_dim_ty}), out_size);
 
     auto out_y = split_out_size.getResult(0);
     auto out_x = split_out_size.getResult(1);
 
-    auto out_count = rewriter.create<MulOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty),
-        out_y, out_x);
+    auto out_count = MulOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty), out_y,
+        out_x);
 
     // Generate what the final output shape will look like.
-    auto out_shape = rewriter.create<PackOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({4}, out_size_element_ty),
+    auto out_shape = PackOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({4}, out_size_element_ty),
         ValueRange({batch, out_y, out_x, channels}));
 
     // Compute the indices along the vertical dimension.
-    auto in_y_f32 = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
-        in_y);
-    auto out_w_f32 = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
-        out_y);
-
-    Value y_scale = rewriter.create<DivOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
+    auto in_y_f32 = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()), in_y);
+    auto out_w_f32 = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()), out_y);
+
+    Value y_scale = DivOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
         in_y_f32, out_w_f32);
 
-    Value zero_f32 = rewriter.create<ConstOp>(
-        loc, GetScalarOfType(rewriter.getF32Type(), 0.0));
-    Value one_f32 = rewriter.create<ConstOp>(
-        loc, GetScalarOfType(rewriter.getF32Type(), 1.0));
-
-    Value y_range = rewriter.create<RangeOp>(
-        loc,
-        tensorflow::GetTypeFromTFTensorShape({out_height_constant},
-                                             rewriter.getF32Type()),
-        zero_f32, out_w_f32, one_f32);
-
-    y_range = rewriter.create<MulOp>(
-        loc,
-        tensorflow::GetTypeFromTFTensorShape({out_height_constant},
-                                             rewriter.getF32Type()),
-        y_range, y_scale);
-
-    y_range =
-        rewriter.create<CastOp>(loc,
-                                tensorflow::GetTypeFromTFTensorShape(
-                                    {out_height_constant}, out_size_element_ty),
-                                y_range);
-
-    y_range = rewriter.create<ReshapeOp>(
-        loc,
+    Value zero_f32 = ConstOp::create(
+        rewriter, loc, GetScalarOfType(rewriter.getF32Type(), 0.0));
+    Value one_f32 = ConstOp::create(
+        rewriter, loc, GetScalarOfType(rewriter.getF32Type(), 1.0));
+
+    Value y_range =
+        RangeOp::create(rewriter, loc,
+                        tensorflow::GetTypeFromTFTensorShape(
+                            {out_height_constant}, rewriter.getF32Type()),
+                        zero_f32, out_w_f32, one_f32);
+
+    y_range = MulOp::create(rewriter, loc,
+                            tensorflow::GetTypeFromTFTensorShape(
+                                {out_height_constant}, rewriter.getF32Type()),
+                            y_range, y_scale);
+
+    y_range = CastOp::create(rewriter, loc,
+                             tensorflow::GetTypeFromTFTensorShape(
+                                 {out_height_constant}, out_size_element_ty),
+                             y_range);
+
+    y_range = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape({out_height_constant, 1},
                                              out_size_element_ty),
         y_range,
-        rewriter.create<PackOp>(
-            loc, tensorflow::GetTypeFromTFTensorShape({2}, out_size_element_ty),
+        PackOp::create(
+            rewriter, loc,
+            tensorflow::GetTypeFromTFTensorShape({2}, out_size_element_ty),
             ValueRange({out_y, one})));
 
-    Value y_indices = rewriter.create<MulOp>(
-        loc,
-        tensorflow::GetTypeFromTFTensorShape({out_height_constant, 1},
-                                             out_size_element_ty),
-        y_range, in_x);
+    Value y_indices =
+        MulOp::create(rewriter, loc,
+                      tensorflow::GetTypeFromTFTensorShape(
+                          {out_height_constant, 1}, out_size_element_ty),
+                      y_range, in_x);
 
     // Compute the indices for the nearest neighbour lookup across the width
     // dim.
-    auto in_x_f32 = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
-        in_x);
-    auto out_h_f32 = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
-        out_x);
-
-    Value x_scale = rewriter.create<DivOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
+    auto in_x_f32 = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()), in_x);
+    auto out_h_f32 = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()), out_x);
+
+    Value x_scale = DivOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
         in_x_f32, out_h_f32);
 
-    Value x_range = rewriter.create<RangeOp>(
-        loc,
-        tensorflow::GetTypeFromTFTensorShape({out_width_constant},
-                                             rewriter.getF32Type()),
-        zero_f32, out_h_f32, one_f32);
-
-    x_range =
-        rewriter.create<MulOp>(loc,
-                               tensorflow::GetTypeFromTFTensorShape(
-                                   {out_width_constant}, rewriter.getF32Type()),
-                               x_range, x_scale);
-
-    x_range =
-        rewriter.create<CastOp>(loc,
-                                tensorflow::GetTypeFromTFTensorShape(
-                                    {out_width_constant}, out_size_element_ty),
-                                x_range);
-
-    Value x_indices = rewriter.create<ReshapeOp>(
-        loc,
+    Value x_range =
+        RangeOp::create(rewriter, loc,
+                        tensorflow::GetTypeFromTFTensorShape(
+                            {out_width_constant}, rewriter.getF32Type()),
+                        zero_f32, out_h_f32, one_f32);
+
+    x_range = MulOp::create(rewriter, loc,
+                            tensorflow::GetTypeFromTFTensorShape(
+                                {out_width_constant}, rewriter.getF32Type()),
+                            x_range, x_scale);
+
+    x_range = CastOp::create(rewriter, loc,
+                             tensorflow::GetTypeFromTFTensorShape(
+                                 {out_width_constant}, out_size_element_ty),
+                             x_range);
+
+    Value x_indices = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape({1, out_width_constant},
                                              out_size_element_ty),
         x_range,
-        rewriter.create<PackOp>(
-            loc, tensorflow::GetTypeFromTFTensorShape({2}, out_size_element_ty),
+        PackOp::create(
+            rewriter, loc,
+            tensorflow::GetTypeFromTFTensorShape({2}, out_size_element_ty),
             ValueRange({one, out_x})));
 
     // Generate the combined index array, reshape to be 1-D.
-    Value indices = rewriter.create<AddV2Op>(
-        loc,
+    Value indices = AddV2Op::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {out_height_constant, out_width_constant}, out_size_element_ty),
         y_indices, x_indices);
 
-    indices = rewriter.create<ReshapeOp>(
-        loc,
+    indices = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape({out_spatial_cst},
                                              out_size_element_ty),
         indices,
-        rewriter.create<ReshapeOp>(
-            loc, tensorflow::GetTypeFromTFTensorShape({1}, out_size_element_ty),
+        ReshapeOp::create(
+            rewriter, loc,
+            tensorflow::GetTypeFromTFTensorShape({1}, out_size_element_ty),
             out_count,
-            rewriter.create<ConstOp>(loc, rewriter.getI64TensorAttr({1}))));
+            ConstOp::create(rewriter, loc, rewriter.getI64TensorAttr({1}))));
 
     // Group the spatial indices and gather along that combined index.
-    Value input_collapsed_spatial = rewriter.create<ReshapeOp>(
-        loc,
+    Value input_collapsed_spatial = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {batch_cst, in_spatial_cst, channels_cst}, input_element_ty),
         input,
-        rewriter.create<PackOp>(
-            loc, tensorflow::GetTypeFromTFTensorShape({3}, out_size_element_ty),
+        PackOp::create(
+            rewriter, loc,
+            tensorflow::GetTypeFromTFTensorShape({3}, out_size_element_ty),
             ValueRange({batch, in_count, channels})));
 
-    Value gathered_values = rewriter.create<GatherV2Op>(
-        loc,
+    Value gathered_values = GatherV2Op::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {batch_cst, out_spatial_cst, channels_cst}, input_element_ty),
         input_collapsed_spatial, indices, /*axis=*/one);
 
     gathered_values =
-        rewriter.create<ReshapeOp>(loc, result_ty, gathered_values, out_shape);
+        ReshapeOp::create(rewriter, loc, result_ty, gathered_values, out_shape);
 
     rewriter.replaceOp(op, gathered_values);
     return success();
@@ -1681,18 +1697,18 @@ struct LowerRollOp : public RewritePattern {
       begin_values[axis_i] = begin_i;
       auto begin_attr = DenseIntElementsAttr::get(axis_type, begin_values);
       auto begin =
-          rewriter.create<ConstOp>(op->getLoc(), axis_type, begin_attr);
+          ConstOp::create(rewriter, op->getLoc(), axis_type, begin_attr);
 
       SmallVector<int64_t> output_shape;
       output_shape.append(input_shape.begin(), input_shape.end());
       output_shape[axis_i] = size_i;
       auto size_attr = DenseIntElementsAttr::get(axis_type, output_shape);
-      auto size = rewriter.create<ConstOp>(op->getLoc(), axis_type, size_attr);
+      auto size = ConstOp::create(rewriter, op->getLoc(), axis_type, size_attr);
 
       auto slice_op_ty = tensorflow::GetTypeFromTFTensorShape(
           output_shape, input_ty.getElementType());
-      return rewriter.create<SliceOp>(op->getLoc(), slice_op_ty, input, begin,
-                                      size);
+      return SliceOp::create(rewriter, op->getLoc(), slice_op_ty, input, begin,
+                             size);
     };
 
     auto result = tf_roll_op.getInput();
@@ -1708,9 +1724,9 @@ struct LowerRollOp : public RewritePattern {
 
       auto dim_attr = DenseIntElementsAttr::get(scalar_type, {axis_i});
       auto concat_dim =
-          rewriter.create<ConstOp>(op->getLoc(), scalar_type, dim_attr);
-      auto concat_op = rewriter.create<ConcatV2Op>(
-          op->getLoc(), input_ty,
+          ConstOp::create(rewriter, op->getLoc(), scalar_type, dim_attr);
+      auto concat_op = ConcatV2Op::create(
+          rewriter, op->getLoc(), input_ty,
          ArrayRef<Value>({slice_op_1.getOutput(), slice_op_2.getOutput()}),
          concat_dim);
       result = concat_op.getResult();
@@ -1741,7 +1757,7 @@ class LowerSoftmaxOp : public OpRewritePattern {
     // Note that the TensorFlow Softmax op verifies that the input rank is
     // greater than or equal to one so the following sequence is valid.
     auto reduce_dim =
-        rewriter.create<TF::ConstOp>(loc, GetI64ElementsAttr({-1}, &rewriter));
+        TF::ConstOp::create(rewriter, loc, GetI64ElementsAttr({-1}, &rewriter));
 
     // Exponential of input values and then their sum can be very large here.
     // Division with large denominator is numerically unstable. To improve
@@ -1750,20 +1766,19 @@ class LowerSoftmaxOp : public OpRewritePattern {
     // after adding or subtracting all inputs in a batch using a common value
    // gives mathematically equivalent result.
    auto max_logits =
-        rewriter.create<TF::MaxOp>(loc, logits, reduce_dim,
-                                   /*keep_dims=*/rewriter.getBoolAttr(true));
-    auto shifted_logits = rewriter.create<TF::SubOp>(loc, logits, max_logits);
+        TF::MaxOp::create(rewriter, loc, logits, reduce_dim,
+                          /*keep_dims=*/rewriter.getBoolAttr(true));
+    auto shifted_logits = TF::SubOp::create(rewriter, loc, logits, max_logits);
 
     // Exponentiate the inputs.
-    Value exp = rewriter.create<TF::ExpOp>(loc, shifted_logits);
+    Value exp = TF::ExpOp::create(rewriter, loc, shifted_logits);
 
     // Compute summation of the exponentials.
-    Value sum =
-        rewriter.create<TF::SumOp>(loc, exp, reduce_dim,
-                                   /*keep_dims=*/rewriter.getBoolAttr(true));
+    Value sum = TF::SumOp::create(rewriter, loc, exp, reduce_dim,
+                                  /*keep_dims=*/rewriter.getBoolAttr(true));
 
     if (use_log) {
-      Value log = rewriter.create<TF::LogOp>(loc, sum);
+      Value log = TF::LogOp::create(rewriter, loc, sum);
       rewriter.replaceOpWithNewOp<SubOp>(op, shifted_logits, log);
     } else {
       rewriter.replaceOpWithNewOp<DivOp>(op, exp, sum);
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td
index a9ff5a8f76268a..1061d564f51afc 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td
@@ -37,7 +37,7 @@ class GetF32Scalar :
 def TrueBoolAttr : AttrConstraint<CPred<"llvm::cast<BoolAttr>($_self).getValue()">>;
 
 def CreateTFShapeOp : NativeCodeCall<
-    "$_builder.create<TF::ShapeOp>($0.getLoc(), $1, $2)">;
+    "TF::ShapeOp::create($_builder, $0.getLoc(), $1, $2)">;
 
 def IsI32 : NativeCodeCall<
     "$_builder.getBoolAttr(getElementTypeOrSelf($0.getType()).isInteger(32))">;
@@ -49,11 +49,11 @@ def CreateTFCastOpI32 : NativeCodeCall<
     "CreateTFCastOpI32(&$_builder, $0.getLoc(), $1, $2)">;
 
 def CreateTensorScatterNdOp : NativeCodeCall<
-    "$_builder.create<TF::ScatterNdOp>("
+    "TF::ScatterNdOp::create($_builder, "
     "$0.getLoc(), $0.getType(), $1, $2, $3, $4)">;
 
 def CreateTensorScatterUpdateOp : NativeCodeCall<
-    "$_builder.create<TF::TensorScatterUpdateOp>("
+    "TF::TensorScatterUpdateOp::create($_builder, "
     "$0.getLoc(), $0.getType(), $0, $1, $2, $3)">;
 
 //===----------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc
index 4ddd3577957163..bd8ae6260ce259 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc
@@ -105,13 +105,13 @@ class RewriteXlaHostComputeMlir
     rewriter.setInsertionPointToStart(&cloned_func.getBody().front());
     auto result_type =
         RankedTensorType::get({3}, rewriter.getType<TF::StringType>());
-    auto dynamic_key =
-        rewriter.create<TF::_XlaCompileMlirPlaceholderProgramKeyOp>(
-            func.getLoc(), /*program=*/result_type, llvm::ArrayRef<Value>{});
+    auto dynamic_key = TF::_XlaCompileMlirPlaceholderProgramKeyOp::create(
+        rewriter, func.getLoc(), /*program=*/result_type,
+        llvm::ArrayRef<Value>{});
 
-    auto recv_at_host = rewriter.create<TF::_XlaRecvAtHostOp>(
-        func.getLoc(), op.getOperandTypes(), /*dynamic_key=*/dynamic_key,
-        op.getSendKeyAttr(),
+    auto recv_at_host = TF::_XlaRecvAtHostOp::create(
+        rewriter, func.getLoc(), op.getOperandTypes(),
+        /*dynamic_key=*/dynamic_key, op.getSendKeyAttr(),
         /*device_ordinal=*/rewriter.getI64IntegerAttr(0),
         rewriter.getStringAttr("TPU"));
     for (auto result :
@@ -120,8 +120,8 @@ class RewriteXlaHostComputeMlir
     }
 
     rewriter.setInsertionPoint(cloned_func.getBody().front().getTerminator());
-    rewriter.create<TF::_XlaSendFromHostOp>(
-        func.getLoc(),
+    TF::_XlaSendFromHostOp::create(
+        rewriter, func.getLoc(),
         cloned_func.getBody().front().getTerminator()->getOperands(),
         /*dynamic_key=*/dynamic_key, op.getRecvKeyAttr(),
         /*device_ordinal=*/rewriter.getI64IntegerAttr(0),
@@ -157,8 +157,8 @@ void UpdateArgAttributes(mlir::func::FuncOp func) {
       // 'sharding' attribute.
      // TODO(b/414807890): Not sure whether we need to pass a V2 sharding to
      // the _XlaShardingV2, do this when we actually have a use case.
-      auto updated_arg = builder.create<TF::XlaShardingOp>(
-          func.getLoc(), arg.getType(), arg, /*sharding=*/sharding,
+      auto updated_arg = TF::XlaShardingOp::create(
+          builder, func.getLoc(), arg.getType(), arg, /*sharding=*/sharding,
           /*_XlaSharding=*/sharding, /*_XlaShardingV2=*/mlir::StringAttr());
       func.getArgument(i).replaceAllUsesExcept(
           updated_arg, llvm::SmallPtrSet<Operation*, 1>({updated_arg}));
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc
index 803f135af624d7..656f87deb0b79f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc
@@ -118,8 +118,8 @@ void MakeShapeOpInvariant(tf_device::ReplicateOp replicate_op, int num_replicas,
   if (block_arg.getOwner() != replicate_block) return;
 
   OpBuilder builder(shape_op);
-  auto new_shape_op = builder.create<TF::VariableShapeOp>(
-      shape_op.getLoc(), shape_op.getType(),
+  auto new_shape_op = TF::VariableShapeOp::create(
+      builder, shape_op.getLoc(), shape_op.getType(),
       replicate_op.GetReplicaOperandForBlockArgument(block_arg,
                                                      /*replica=*/0));
   shape_op.replaceAllUsesWith(new_shape_op.getOperation());
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc
index deef690b4d9636..1945aa6d811c19 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc
@@ -45,8 +45,8 @@ OpT AddOperandAndRewriteAs(Operation* op, Value operand, NamedAttrList attr,
   builder->setInsertionPoint(op);
   auto operands = llvm::to_vector<4>(op->getOperands());
   operands.push_back(operand);
-  auto new_op = builder->create<OpT>(op->getLoc(), op->getResultTypes(),
-                                     operands, attr.getAttrs());
+  auto new_op = OpT::create(*builder, op->getLoc(), op->getResultTypes(),
+                            operands, attr.getAttrs());
   op->replaceAllUsesWith(new_op.getOperation()->getResults());
   op->erase();
   return new_op;
@@ -82,8 +82,8 @@ LogicalResult RunOnRegion(Region* region) {
   OpBuilder builder(region);
   auto output_ty =
       RankedTensorType::get({}, VariantType::get(region->getContext()));
-  auto dedup_op = builder.create<XlaRecvTPUEmbeddingDeduplicationDataOp>(
-      loc, output_ty, config);
+  auto dedup_op = XlaRecvTPUEmbeddingDeduplicationDataOp::create(
+      builder, loc, output_ty, config);
 
   // Rewrite RecvTPUEmbeddingActivations op to the corresponding internal op.
   if (recv_op)
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
index 1e7958660fd8c4..ce3b6bb5dd5070 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
@@ -213,13 +213,13 @@ tf_device::LaunchOp CreateLaunchForBlock(OpBuilder* builder,
   }
 
   builder->setInsertionPointAfter(before_op);
-  auto launch = builder->create<tf_device::LaunchOp>(
-      before_op->getLoc(), builder->getStringAttr(host_device),
-      launch_result_types);
+  auto launch = tf_device::LaunchOp::create(*builder, before_op->getLoc(),
+                                            builder->getStringAttr(host_device),
+                                            launch_result_types);
   launch.getBody().push_back(launch_block);
 
   builder->setInsertionPointToEnd(&launch.GetBody());
-  builder->create<tf_device::ReturnOp>(before_op->getLoc(), launch_results);
+  tf_device::ReturnOp::create(*builder, before_op->getLoc(), launch_results);
 
   return launch;
 }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc
index 72302903b37fa5..d57390cbc919ad 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc
@@ -37,16 +37,17 @@ struct FuseParallelMapAndBatch : public OpRewritePattern {
 
     // The type of the `num_parallel_calls` argument in ParallelMapDataset
    // and MapAndBatchDataset is different (int32 and int64 respectively)
-    auto num_parallel_calls_op = rewriter.create<CastOp>(
-        op.getLoc(), UnrankedTensorType::get(rewriter.getIntegerType(64)),
+    auto num_parallel_calls_op = CastOp::create(
+        rewriter, op.getLoc(),
+        UnrankedTensorType::get(rewriter.getIntegerType(64)),
         batchInputOp.getNumParallelCalls(), rewriter.getBoolAttr(false));
 
     if (op.getMetadata() != batchInputOp.getMetadata()) {
       return failure();
     }
 
-    auto fused_op = rewriter.create<MapAndBatchDatasetOp>(
-        op.getLoc(), op.getType(), batchInputOp.getInputDataset(),
+    auto fused_op = MapAndBatchDatasetOp::create(
+        rewriter, op.getLoc(), op.getType(), batchInputOp.getInputDataset(),
         batchInputOp.getOtherArguments(), op.getBatchSize(),
         num_parallel_calls_op.getY(), op.getDropRemainder(),
         batchInputOp.getF(), op.getOutputTypes(), op.getOutputShapes(),
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc
index bb4c951065f771..2ee19787c7552f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc
@@ -131,8 +131,8 @@ void SinkResourceWritesIntoParallelExecute(
     new_result_types.push_back(old_result.getType());
 
   OpBuilder builder(parallel_execute);
-  auto new_parallel_execute = builder.create<tf_device::ParallelExecuteOp>(
-      parallel_execute.getLoc(), num_regions, new_result_types);
+  auto new_parallel_execute = tf_device::ParallelExecuteOp::create(
+      builder, parallel_execute.getLoc(), num_regions, new_result_types);
 
   for (auto region : llvm::zip(new_parallel_execute.getRegions(),
                                parallel_execute.getRegions()))
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc
index 5f708ce0ee1a74..8cd90d0a96e9e9 100644
---
a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc @@ -106,9 +106,9 @@ void TPUResourceReadForWritePass::runOnOperation() { if (!resource_and_type.resource) continue; if (ClusterFuncHasResourceRead(cluster_func, resource_and_type.resource)) continue; - auto new_read = builder.create( - resource_and_type.resource.getLoc(), resource_and_type.subtype, - resource_and_type.resource); + auto new_read = TF::ReadVariableOp::create( + builder, resource_and_type.resource.getLoc(), + resource_and_type.subtype, resource_and_type.resource); read_operands.push_back(new_read.getValue()); } @@ -119,8 +119,9 @@ void TPUResourceReadForWritePass::runOnOperation() { operands.append(read_operands.begin(), read_operands.end()); auto loc = cluster_func.getLoc(); - auto new_cluster_func = builder.create( - loc, cluster_func.getResultTypes(), operands, cluster_func->getAttrs()); + auto new_cluster_func = tf_device::ClusterFuncOp::create( + builder, loc, cluster_func.getResultTypes(), operands, + cluster_func->getAttrs()); cluster_func.replaceAllUsesWith(new_cluster_func); func::FuncOp func = cluster_func.getFuncOp(); Block& block = func.front(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc index 03618d23464b0a..85db75ea51a543 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc @@ -89,9 +89,9 @@ TF::ReshapeOp ConvertTFBatchMatMulOp::createReshapeOp( Type resultType = RankedTensorType::get(shape, element_type); auto constant_attr = DenseElementsAttr::get(shape_spec_type, shape); auto shape_tensor = - rewriter.create(loc, shape_spec_type, constant_attr); - return rewriter.create(loc, resultType, /*tensor=*/value, - /*shape=*/shape_tensor); + TF::ConstOp::create(rewriter, loc, shape_spec_type, constant_attr); + return TF::ReshapeOp::create(rewriter, loc, resultType, /*tensor=*/value, + /*shape=*/shape_tensor); } template @@ -122,16 +122,16 @@ std::vector ConvertTFBatchMatMulOp::sliceInput( auto split_dimension_type = RankedTensorType::get({}, rewriter.getIntegerType(32)); auto split_dimension_attr = DenseElementsAttr::get(split_dimension_type, 0); - auto split_dimension_op = rewriter.create( - loc, split_dimension_type, split_dimension_attr); + auto split_dimension_op = TF::ConstOp::create( + rewriter, loc, split_dimension_type, split_dimension_attr); // Split along each batch. SmallVector slice_size = {1, num_rows, num_cols}; Type slice_result_type = RankedTensorType::get(slice_size, element_type); llvm::SmallVector output_types(batch_size, slice_result_type); - auto split_op = rewriter.create(loc, output_types, - split_dimension_op.getOutput(), - reshape_op.getOutput()); + auto split_op = TF::SplitOp::create(rewriter, loc, output_types, + split_dimension_op.getOutput(), + reshape_op.getOutput()); // Squeeze each batch, i.e. 
reshape // [1, num_rows, num_cols] -> [num_rows, num_cols] @@ -259,11 +259,11 @@ LogicalResult ConvertTFBatchMatMulOp::matchAndRewrite( lhs_batch_idx = batch_idx; rhs_batch_idx = batch_idx; } - auto matmul = rewriter.create(loc, matmul_type, - /*a=*/sliced_lhs[lhs_batch_idx], - /*b=*/sliced_rhs[rhs_batch_idx], - /*transpose_a=*/op.getAdjX(), - /*transpose_b=*/op.getAdjY()); + auto matmul = TF::MatMulOp::create(rewriter, loc, matmul_type, + /*a=*/sliced_lhs[lhs_batch_idx], + /*b=*/sliced_rhs[rhs_batch_idx], + /*transpose_a=*/op.getAdjX(), + /*transpose_b=*/op.getAdjY()); matmuls.emplace_back(matmul.getProduct()); } @@ -272,7 +272,7 @@ LogicalResult ConvertTFBatchMatMulOp::matchAndRewrite( {bcast.output_batch_size(), rows, cols}, element_type); const auto axis = rewriter.getI64IntegerAttr(0); auto pack_op = - rewriter.create(loc, packed_type, /*values=*/matmuls, axis); + TF::PackOp::create(rewriter, loc, packed_type, /*values=*/matmuls, axis); // Reshape the rank-3 tensor into the correct output shape. const auto& result_batch_shape = bcast.output_batch_shape().dim_sizes(); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc index 9d9780d231523f..3fea8e64e85ca3 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc @@ -15,11 +15,22 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h" +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "xla/tsl/lib/core/status_test_util.h" +#include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" -#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/file_system.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/util/dump_graph.h" @@ -68,7 +79,7 @@ class StringWritableFile : public WritableFile { TEST(Dump, TextualIrToFileSuccess) { Graph graph(OpRegistry::Global()); Node* node; - TF_CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node)); + CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node)); setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1); UseMlirForGraphDump(MlirDumpConfig()); @@ -98,7 +109,7 @@ TEST(Dump, TextualIrWithOptions) { TEST(Dump, DumpToTFG) { Graph graph(OpRegistry::Global()); Node* node; - TF_CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node)); + CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node)); std::string actual; StringWritableFile file(&actual); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc index dcd71dedc9790f..8634afe5fc1498 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc @@ -15,57 +15,71 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" -#include "llvm/ADT/Twine.h" #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "xla/hlo/testlib/test.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status_test_util.h" +#include "xla/tsl/lib/core/status_test_util.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" namespace mlir { namespace { -using testing::HasSubstr; +using ::testing::HasSubstr; -TEST(ErrorUtilTest, StatusScopedDiagnosticHandler) { - MLIRContext context; - auto id = StringAttr::get(&context, "//tensorflow/python/test.py"); - auto loc = FileLineColLoc::get(&context, id, 0, 0); +class ErrorUtilTest : public ::testing::Test { + protected: + ErrorUtilTest() + : id_(StringAttr::get(&context_, "//tensorflow/python/test.py")), + loc_(FileLineColLoc::get(&context_, id_, 0, 0)) {} + + MLIRContext context_; + StringAttr id_; + FileLineColLoc loc_; +}; + +using StatusScopedDiagnosticHandlerTest = ErrorUtilTest; + +TEST_F(StatusScopedDiagnosticHandlerTest, + OkWithoutDiagnosticGetsPassedThrough) { + TF_ASSERT_OK( + StatusScopedDiagnosticHandler(&context_).Combine(tensorflow::OkStatus())); +} + +TEST_F(StatusScopedDiagnosticHandlerTest, + VerifyDiagnosticsAreCapturedAsUnknownStatus) { + StatusScopedDiagnosticHandler handler(&context_); + emitError(loc_) << "Diagnostic message"; + ASSERT_TRUE(absl::IsUnknown(handler.ConsumeStatus())); +} + +TEST_F(StatusScopedDiagnosticHandlerTest, VerifyPassedInErrorsArePropagated) { + const Status err = tensorflow::errors::Internal("Passed in error"); + ASSERT_TRUE( + absl::IsInternal(StatusScopedDiagnosticHandler(&context_).Combine(err))); +} + +TEST_F(StatusScopedDiagnosticHandlerTest, + VerifyThatReportedDiagnosticsAreAppendedToPassedInError) { + StatusScopedDiagnosticHandler ssdh(&context_); + emitError(loc_) << "Diagnostic message reported"; + emitError(loc_) << "Second diagnostic message reported"; + const Status s = + ssdh.Combine(tensorflow::errors::Internal("Passed in error")); + ASSERT_TRUE(absl::IsInternal(s)); + EXPECT_THAT(s.message(), HasSubstr("Passed in error")); + EXPECT_THAT(s.message(), HasSubstr("Diagnostic message reported")); + EXPECT_THAT(s.message(), HasSubstr("Second diagnostic message reported")); +} - // Test OK without diagnostic gets passed through. - { - TF_ASSERT_OK( - StatusScopedDiagnosticHandler(&context).Combine(absl::OkStatus())); - } - - // Verify diagnostics are captured as Unknown status. - { - StatusScopedDiagnosticHandler handler(&context); - emitError(loc) << "Diagnostic message"; - ASSERT_TRUE(absl::IsUnknown(handler.ConsumeStatus())); - } - - // Verify passed in errors are propagated. - { - Status err = tensorflow::errors::Internal("Passed in error"); - ASSERT_TRUE( - absl::IsInternal(StatusScopedDiagnosticHandler(&context).Combine(err))); - } - - // Verify diagnostic reported are append to passed in error. 
- { - auto function = [&]() { - emitError(loc) << "Diagnostic message reported"; - emitError(loc) << "Second diagnostic message reported"; - return tensorflow::errors::Internal("Passed in error"); - }; - StatusScopedDiagnosticHandler ssdh(&context); - Status s = ssdh.Combine(function()); - ASSERT_TRUE(absl::IsInternal(s)); - EXPECT_THAT(s.message(), HasSubstr("Passed in error")); - EXPECT_THAT(s.message(), HasSubstr("Diagnostic message reported")); - EXPECT_THAT(s.message(), HasSubstr("Second diagnostic message reported")); - } +TEST_F(StatusScopedDiagnosticHandlerTest, VerifyThatWarningsAreIgnored) { + // Note: this logic is actually implemented in BaseScopedDiagnosticHandler's + // handler() function, but only StatusScopedDiagnosticHandler uses it. + StatusScopedDiagnosticHandler handler(&context_); + emitWarning(loc_) << "Warning message"; + TF_EXPECT_OK(handler.ConsumeStatus()); } TEST(ErrorUtilTest, StatusScopedDiagnosticHandlerWithFilter) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc b/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc index 2ee95c1337aa52..cb406a2d0e3fc9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc @@ -19,17 +19,21 @@ limitations under the License. #include #include +#include "absl/log/check.h" +#include "absl/status/status.h" #include "absl/strings/match.h" #include "llvm/Support/CommandLine.h" #include "tensorflow/core/common_runtime/device_mgr.h" -#include "tensorflow/core/common_runtime/threadpool_device.h" -#include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/device_attributes.pb.h" #include "tensorflow/core/framework/device_factory.h" +#include "tensorflow/core/framework/resource_handle.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/threadpool_options.h" #include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/public/session_options.h" @@ -81,9 +85,9 @@ void FakeSession::InitVariables() { auto container = device->resource_manager()->default_container(); // Create 2 resources and initialize them with dummy values. 
- TF_CHECK_OK(device->resource_manager()->Create( + CHECK_OK(device->resource_manager()->Create( container, "var1", new tensorflow::Var(tensorflow::DataType::DT_FLOAT))); - TF_CHECK_OK(device->resource_manager()->Create( + CHECK_OK(device->resource_manager()->Create( container, "var2", new tensorflow::Var(tensorflow::DataType::DT_FLOAT))); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.cc index 4bca511ca252b5..52d1bfc8ffde3a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.cc @@ -25,15 +25,15 @@ tf_device::ParallelExecuteOp BuildParallelExecuteOp( tf_device::ClusterFuncOp cluster_func, OpBuilder* builder) { const auto output_types = cluster_func.getResultTypes(); builder->setInsertionPoint(cluster_func); - auto parallel_execute = builder->create( - cluster_func.getLoc(), 1, output_types); + auto parallel_execute = tf_device::ParallelExecuteOp::create( + *builder, cluster_func.getLoc(), 1, output_types); cluster_func->remove(); auto& block = parallel_execute.GetRegionBlockWithIndex(0); builder->setInsertionPointToEnd(&block); builder->insert(cluster_func); cluster_func.replaceAllUsesWith(parallel_execute); - builder->create(block.getParent()->getLoc(), - cluster_func.getResults()); + tf_device::ReturnOp::create(*builder, block.getParent()->getLoc(), + cluster_func.getResults()); return parallel_execute; } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 932c941c6f5f7a..a7b676d8541909 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -793,8 +793,8 @@ TEST(TPURewriteDeviceUtilTest, TestHasModelParallelismFalse) { mlir::OpBuilder builder(module_ref->getBodyRegion()); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kNumCoresPerReplicaAttr, builder.getIntegerAttr(builder.getIntegerType(64), 1)); cluster->setAttr(kTopologyAttr, builder.getStringAttr("")); @@ -811,8 +811,8 @@ TEST(TPURewriteDeviceUtilTest, TestHasModelParallelismTrue) { mlir::OpBuilder builder(module_ref->getBodyRegion()); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kNumCoresPerReplicaAttr, builder.getIntegerAttr(builder.getIntegerType(64), 5)); cluster->setAttr(kTopologyAttr, builder.getStringAttr("")); @@ -830,8 +830,8 @@ TEST(TPURewriteDeviceUtilTest, mlir::OpBuilder builder(module_ref->getBodyRegion()); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kNumCoresPerReplicaAttr, builder.getIntegerAttr(builder.getIntegerType(64), 1)); cluster->setAttr(kTopologyAttr, builder.getStringAttr("")); @@ -848,8 +848,8 @@ TEST(TPURewriteDeviceUtilTest, mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder 
builder(module_ref->getBodyRegion()); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kDeviceAssignmentAttr, builder.getArrayAttr({})); mlir::TF::RuntimeDevices devices; @@ -865,8 +865,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostFailDeviceMissingAttributes) { mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kNumCoresPerReplicaAttr, builder.getIntegerAttr(builder.getIntegerType(64), 1)); @@ -884,8 +884,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingTopology) { mlir::OpBuilder builder(module_ref->getBodyRegion()); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kNumCoresPerReplicaAttr, builder.getIntegerAttr(builder.getIntegerType(64), 1)); cluster->setAttr(kDeviceAssignmentAttr, builder.getArrayAttr({})); @@ -904,8 +904,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingDeviceAssignment) { mlir::OpBuilder builder(module_ref->getBodyRegion()); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kNumCoresPerReplicaAttr, builder.getIntegerAttr(builder.getIntegerType(64), 1)); cluster->setAttr(kTopologyAttr, builder.getStringAttr("")); @@ -924,8 +924,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceAssignment) { mlir::OpBuilder builder(module_ref->getBodyRegion()); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kNumCoresPerReplicaAttr, builder.getIntegerAttr(builder.getIntegerType(64), 1)); cluster->setAttr(kTopologyAttr, builder.getStringAttr("")); @@ -951,8 +951,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceName) { llvm::ArrayRef({"bad_device_name"}))); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kNumCoresPerReplicaAttr, builder.getIntegerAttr(builder.getIntegerType(64), 1)); cluster->setAttr(kTopologyAttr, builder.getStringAttr("")); @@ -974,16 +974,16 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceTPUReplicate) { llvm::SmallDenseMap> devices; - auto replicate = builder.create( - mlir::UnknownLoc::get(&context), /*num_replicas=*/2, devices, + auto replicate = mlir::tf_device::ReplicateOp::create( + builder, mlir::UnknownLoc::get(&context), /*num_replicas=*/2, devices, llvm::ArrayRef>{}, mlir::ValueRange{}, mlir::TypeRange{}); builder.setInsertionPoint(&replicate.getBody().front(), replicate.getBody().front().begin()); llvm::SmallVector result_types; - auto 
cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); mlir::TF::RuntimeDevices runtime_devices; std::string host_device; @@ -1007,8 +1007,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceNotReplicated) { "/job:worker/replica:0/task:0/device:CPU:0"}))); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); cluster->setAttr(kNumCoresPerReplicaAttr, builder.getIntegerAttr(builder.getIntegerType(64), 1)); cluster->setAttr(kTopologyAttr, builder.getStringAttr("")); @@ -1034,8 +1034,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceInGenericPipeline) { {"/job:localhost/replica:0/task:0/device:CPU:0"}))); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); mlir::TF::RuntimeDevices runtime_devices; (void)GetDevicesFromOp(*module_ref, &runtime_devices); @@ -1060,8 +1060,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceInGenericPipelineMultiCPUs) { "/job:worker/replica:0/task:2/device:CPU:0"}))); llvm::SmallVector result_types; - auto cluster = builder.create( - mlir::UnknownLoc::get(&context), result_types); + auto cluster = mlir::tf_device::ClusterOp::create( + builder, mlir::UnknownLoc::get(&context), result_types); mlir::TF::RuntimeDevices runtime_devices; (void)GetDevicesFromOp(*module_ref, &runtime_devices); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.cc index ba4d1b71a857cd..82b7202d6d78e9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.cc @@ -83,8 +83,8 @@ int MovePreservedParallelExecuteChildren( // `num_moved_children` is the number of children that will be preserved. const size_t num_moved_children = old_parallel_execute.getRegions().size() - 1; - *new_parallel_execute = builder->create( - old_parallel_execute->getLoc(), + *new_parallel_execute = mlir::tf_device::ParallelExecuteOp::create( + *builder, old_parallel_execute->getLoc(), num_moved_children + num_cores_per_replica, concatenated_output_types); // `cluster_idx` is the index of the child with the `ClusterFuncOp`, which @@ -118,12 +118,12 @@ mlir::tf_device::LaunchOp WrapOpInLaunch(mlir::OpBuilder* builder, llvm::StringRef device) { mlir::OpBuilder::InsertPoint insert_point = builder->saveInsertionPoint(); - auto launch = builder->create( - loc, builder->getStringAttr(device), op->getResultTypes()); + auto launch = mlir::tf_device::LaunchOp::create( + *builder, loc, builder->getStringAttr(device), op->getResultTypes()); launch.getBody().push_back(new mlir::Block); builder->setInsertionPointToEnd(&launch.GetBody()); - builder->create(loc, op->getResults()); + mlir::tf_device::ReturnOp::create(*builder, loc, op->getResults()); // Move op inside cluster. 
op->moveBefore(launch.GetBody().getTerminator()); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc index 89e00e9b4d628c..3bca701131151f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc @@ -94,22 +94,23 @@ mlir::TF::SliceOp CreateSliceOp(mlir::OpBuilder* builder, auto start_position_type = mlir::RankedTensorType::get(shape.dims(), builder->getIntegerType(64)); - auto start_position_op = builder->create( - input.getLoc(), mlir::DenseIntElementsAttr::get(start_position_type, - slice_start_position)); - - auto slice_size_op = builder->create( - input.getLoc(), mlir::DenseIntElementsAttr::get( - mlir::RankedTensorType::get( - shape.dims(), builder->getIntegerType(64)), - slice_size)); + auto start_position_op = + mlir::TF::ConstOp::create(*builder, input.getLoc(), + mlir::DenseIntElementsAttr::get( + start_position_type, slice_start_position)); + + auto slice_size_op = mlir::TF::ConstOp::create( + *builder, input.getLoc(), + mlir::DenseIntElementsAttr::get( + mlir::RankedTensorType::get(shape.dims(), + builder->getIntegerType(64)), + slice_size)); auto slice_result_type = mlir::RankedTensorType::get(slice_size, getElementTypeOrSelf(input)); - return builder->create(input.getLoc(), slice_result_type, - input, start_position_op, - slice_size_op); + return mlir::TF::SliceOp::create(*builder, input.getLoc(), slice_result_type, + input, start_position_op, slice_size_op); } mlir::TF::PadOp CreatePadOp(mlir::OpBuilder* builder, @@ -135,15 +136,15 @@ mlir::TF::PadOp CreatePadOp(mlir::OpBuilder* builder, auto padding_type = mlir::RankedTensorType::get({num_dims, 2}, builder->getIntegerType(64)); auto paddings = mlir::DenseIntElementsAttr::get(padding_type, padding_values); - auto paddings_value = builder->create(location, paddings); + auto paddings_value = mlir::TF::ConstOp::create(*builder, location, paddings); mlir::SmallVector expand_shape(padded_shape.begin(), padded_shape.end()); auto expand_result_type = mlir::RankedTensorType::get(expand_shape, input_type.getElementType()); - return builder->create(location, expand_result_type, - src_input, paddings_value); + return mlir::TF::PadOp::create(*builder, location, expand_result_type, + src_input, paddings_value); } // Creates a tf::SplitOp that splits 'src_input' into 'num_splits' ways @@ -198,8 +199,8 @@ mlir::LogicalResult CreateSplitOp( output_type = input_type; } - auto split_dimension_op = builder->create( - location, split_dim_type, split_dimension_attr); + auto split_dimension_op = mlir::TF::ConstOp::create( + *builder, location, split_dim_type, split_dimension_attr); if (is_ici_weight_dist_spmd) { split_dimension_op->setAttr(kICIWeightDistributionMlirBridgeMarker, builder->getBoolAttr(true)); @@ -207,8 +208,9 @@ mlir::LogicalResult CreateSplitOp( // Creates a split op that splits |src_input| along |split_dimension|. 
llvm::SmallVector output_types(num_split, output_type); - *split_op = builder->create( - location, output_types, split_dimension_op.getOutput(), src_input); + *split_op = + mlir::TF::SplitOp::create(*builder, location, output_types, + split_dimension_op.getOutput(), src_input); (*split_op)->setAttr( kNumSplitAttr, builder->getIntegerAttr(builder->getIntegerType(32), num_split)); @@ -230,8 +232,8 @@ mlir::TF::ConcatOp CreateConcatOp(const int concat_dimension, mlir::RankedTensorType::get({}, builder->getIntegerType(32)); auto concat_dimension_attr = mlir::DenseElementsAttr::get(concat_dim_type, concat_dimension); - auto concat_dimension_op = builder->create( - location, concat_dim_type, concat_dimension_attr); + auto concat_dimension_op = mlir::TF::ConstOp::create( + *builder, location, concat_dim_type, concat_dimension_attr); // Correctly set output shapes of concat op output if output shape is // statically known. Since the shape of TPUExecute op must be the same @@ -253,8 +255,8 @@ mlir::TF::ConcatOp CreateConcatOp(const int concat_dimension, output_type = input_type; } - return builder->create( - location, output_type, concat_dimension_op.getOutput(), inputs); + return mlir::TF::ConcatOp::create(*builder, location, output_type, + concat_dimension_op.getOutput(), inputs); } mlir::TF::XlaConcatNDOp CreateXlaConcatNDOp( @@ -292,9 +294,9 @@ mlir::TF::XlaConcatNDOp CreateXlaConcatNDOp( output_type = input_slice_type; } - auto op = builder.create( - location, output_type, inputs, builder.getI64ArrayAttr(num_concats), - builder.getI64ArrayAttr(paddings)); + auto op = mlir::TF::XlaConcatNDOp::create( + builder, location, output_type, inputs, + builder.getI64ArrayAttr(num_concats), builder.getI64ArrayAttr(paddings)); return op; } @@ -338,9 +340,9 @@ mlir::LogicalResult CreateXlaSplitNDOp(const mlir::Location& location, << absl::StrJoin(input_shape, ",") << ", Padding: " << absl::StrJoin(paddings, ","); - *xla_split_op = builder->create( - location, output_types, src_input, builder->getI64ArrayAttr(num_splits), - builder->getI64ArrayAttr(paddings)); + *xla_split_op = mlir::TF::XlaSplitNDOp::create( + *builder, location, output_types, src_input, + builder->getI64ArrayAttr(num_splits), builder->getI64ArrayAttr(paddings)); if (is_ici_weight_dist_spmd) { (*xla_split_op) ->setAttr(kICIWeightDistributionMlirBridgeMarker, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc index b13e099fde3557..475bd79849e80e 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc @@ -85,7 +85,7 @@ TEST(LegalizeMlirTest, LegalizesModule) { /*shape_determination_fns=*/{}, &compilation_result); EXPECT_TRUE(status.ok()); - EXPECT_THAT(status.value(), HasSubstr("mhlo.const")); + EXPECT_THAT(status.value(), HasSubstr("stablehlo.constant")); } TEST(LegalizeMlirTest, FailsLegalizesModule) { diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD index 746bca0cdb79b7..da75a97030412d 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD @@ -323,6 +323,7 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ + ":mlir_roundtrip_flags", "//tensorflow/compiler/jit:shape_inference_helpers", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/tensorflow", @@ -349,13 +350,13 @@ cc_library( "//tensorflow/core:lib", 
"//tensorflow/core:protos_all_cc", "//tensorflow/core/common_runtime:function_body", - "//tensorflow/core/platform:crash_analysis", "//tensorflow/core/platform:types", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", @@ -365,7 +366,6 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", "@local_xla//xla:status_macros", - "@local_xla//xla/tsl/platform:status", ], ) diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.cc index edf726134f66bd..cb48ff03def75d 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.cc @@ -34,6 +34,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/container/inlined_vector.h" +#include "absl/log/check.h" #include "absl/log/log.h" #include "absl/status/status.h" #include "absl/strings/match.h" @@ -74,7 +75,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -83,14 +83,15 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v2/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util.h" #include "tensorflow/compiler/mlir/tf2xla/internal/node_order.h" #include "tensorflow/compiler/tf2xla/functionalize_control_flow.h" #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_defs.h" #include "xla/status_macros.h" +#include "xla/tsl/platform/crash_analysis.h" #include "xla/tsl/platform/errors.h" -#include "xla/tsl/platform/status.h" #include "xla/tsl/platform/statusor.h" #include "tensorflow/core/common_runtime/function_body.h" #include "tensorflow/core/common_runtime/function_def_utils.h" @@ -120,11 +121,9 @@ limitations under the License. 
#include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/crash_analysis.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stack_frame.h" -#include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" @@ -1889,7 +1888,7 @@ mlir::Operation* ImporterBase::CreateOperation( NameRangeMap input_ranges, output_ranges; // This will fail only if the OpDef is syntactically invalid. // TODO(jpienaar): Convert this CHECK into a properly propagated error. - TF_CHECK_OK( + CHECK_OK( NameRangesForNode(node, node.op_def(), &input_ranges, &output_ranges)); if (inner_op->hasTrait()) { // Add derived "operand_segment_sizes" attr to the created operation. diff --git a/tensorflow/compiler/mlir/tf2xla/internal/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/BUILD index 4e7d058c3c6c6c..f292b270f855e8 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/internal/BUILD @@ -419,10 +419,10 @@ tf_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core/framework:tensor_testutil", "//tensorflow/core/platform:enable_tf2_utils", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@com_google_googletest//:gtest_main", "@local_xla//xla/tsl/lib/core:status_test_util", - "@local_xla//xla/tsl/platform:status", ], ) diff --git a/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util_test.cc index 8ffe558029ad8b..cb332fe4fb997b 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/log/check.h" #include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -31,7 +32,6 @@ limitations under the License. 
#include "tensorflow/cc/ops/tpu_functional_ops.h" #include "tensorflow/cc/ops/tpu_replication_ops.h" #include "xla/tsl/lib/core/status_test_util.h" -#include "xla/tsl/platform/status.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/node_def.pb.h" @@ -124,11 +124,11 @@ Node* FromNodeDef(absl::string_view name, absl::string_view node_type, } NodeDef node_def; - TF_CHECK_OK(builder.Finalize(&node_def)); + CHECK_OK(builder.Finalize(&node_def)); absl::Status s; Node* node = graph.AddNode(node_def, &s); - TF_CHECK_OK(s); + CHECK_OK(s); return node; } @@ -547,12 +547,12 @@ TEST(UnsupportedOpTest, builder.Attr("dtypes", DT_FLOAT); builder.Attr("shapes", 1); NodeDef node_def; - TF_CHECK_OK(builder.Finalize(&node_def)); + CHECK_OK(builder.Finalize(&node_def)); absl::Status s; Node* node_InfeedDequeueTuple = (*root.graph()).AddNode(node_def, &s); node_InfeedDequeueTuple->set_requested_device( "/device:TPU_REPLICATED_CORE:0"); - TF_CHECK_OK(s); + CHECK_OK(s); ASSERT_NE(node_InfeedDequeueTuple, nullptr); Graph graph(OpRegistry::Global()); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc index 7d00bc41716979..f40ada575d2f4a 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc @@ -84,7 +84,7 @@ TEST(LegalizationOpConfigTest, CountLoweringsSet) { // a new op, we should expect these to change too. EXPECT_EQ(mlir_lowering_count, 67); EXPECT_EQ(tf2xla_fallback_count, 333); - EXPECT_EQ(non_categorized_count, 434); + EXPECT_EQ(non_categorized_count, 435); } // Just a counter test to see which ops have duplicate lowerings. This isn't a diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc index 2ab0c3c619b292..e4fe30755c2eb7 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc @@ -229,8 +229,8 @@ static std::optional GetIntegerHLOAxisFromTFAxis(Value value, /// the shape of the input value. static stablehlo::ConvertOp CastValueToI64(Location loc, Value value, PatternRewriter *rewriter) { - return rewriter->create(loc, value, - rewriter->getIntegerType(64)); + return stablehlo::ConvertOp::create(*rewriter, loc, value, + rewriter->getIntegerType(64)); } // Creates an unpack op along the 0th dimension of the tensor. 
The `value` input @@ -242,9 +242,9 @@ static TF::UnpackOp UnpackTensorAlongZeroDim(Location loc, Value value, SmallVector unpacked_indices_type( num_outputs, tensorflow::GetTypeFromTFTensorShape({}, indices_type.getElementType())); - auto unpacked_indices = rewriter->create( - loc, unpacked_indices_type, value, - IntegerAttr::get(rewriter->getIntegerType(64), 0)); + auto unpacked_indices = + TF::UnpackOp::create(*rewriter, loc, unpacked_indices_type, value, + IntegerAttr::get(rewriter->getIntegerType(64), 0)); return unpacked_indices; } @@ -277,8 +277,8 @@ tensorflow::TensorShape ToTensorShape( static stablehlo::ConstantOp GetScalarLimitConstOfType(Type ty, Location loc, hlo::ScalarLimit limit, OpBuilder *builder) { - return builder->create( - loc, hlo::getScalarLimitOfType(ty, limit)); + return stablehlo::ConstantOp::create(*builder, loc, + hlo::getScalarLimitOfType(ty, limit)); } // Deprecated: This is maintained to aid in porting old code that is not yet @@ -396,12 +396,12 @@ static Value Broadcast1DToFeatureDim(Location loc, Value broadcast_to, OpBuilder &builder) { auto broadcast_dims = GetI64ArrayAttr({feature_dim}, &builder); auto to_type = mlir::cast(broadcast_to.getType()); - auto result_shape = builder.create(loc, broadcast_to); + auto result_shape = shape::ShapeOfOp::create(builder, loc, broadcast_to); auto result_extents_type = GetExtentsTensorTypeFor(to_type); - auto result_extents = builder.create( - loc, result_extents_type, result_shape); - return builder.create( - loc, to_type, broadcast_from, result_extents, broadcast_dims); + auto result_extents = shape::ToExtentTensorOp::create( + builder, loc, result_extents_type, result_shape); + return stablehlo::DynamicBroadcastInDimOp::create( + builder, loc, to_type, broadcast_from, result_extents, broadcast_dims); } // Broadcasts `input` to the shape of `broadcast_to` value following @@ -413,15 +413,15 @@ static Value Broadcast1DToFeatureDim(Location loc, Value broadcast_to, // supports unranked inputs in the lowering. static Value BroadcastToShapeOf(Location loc, Value input, Value broadcast_to, OpBuilder &builder) { - auto result_shape = builder.create(loc, broadcast_to); + auto result_shape = shape::ShapeOfOp::create(builder, loc, broadcast_to); auto to_type = mlir::cast(broadcast_to.getType()); auto result_extents_type = GetExtentsTensorTypeFor(to_type); - auto result_extents = builder.create( - loc, result_extents_type, result_shape); + auto result_extents = shape::ToExtentTensorOp::create( + builder, loc, result_extents_type, result_shape); int64_t rank = mlir::cast(input.getType()).getRank(); auto broadcast_dims = GetI64ArrayAttrForSeq(0, rank, &builder); - return builder.create( - loc, to_type, input, result_extents, broadcast_dims); + return stablehlo::DynamicBroadcastInDimOp::create( + builder, loc, to_type, input, result_extents, broadcast_dims); } // Builds a set of operations for applying reduction on the input value. 
A @@ -430,9 +430,9 @@ static Value ApplyReduction(Location loc, Value input, DenseIntElementsAttr reduce_dims, OpBuilder *builder) { auto reduce_dims_op = - builder->create(loc, reduce_dims); - return builder->create(loc, input, reduce_dims_op, - builder->getBoolAttr(false)); + stablehlo::ConstantOp::create(*builder, loc, reduce_dims); + return TF::SumOp::create(*builder, loc, input, reduce_dims_op, + builder->getBoolAttr(false)); } // Creates a stablehlo.rng_uniform op with `builder` to generate `num_elements` @@ -440,17 +440,16 @@ static Value ApplyReduction(Location loc, Value input, static stablehlo::RngOp CreateRngUniform32(Location loc, int num_elements, int lower_limit, int upper_limit, OpBuilder *builder) { - auto shape_tensor = builder->create( - loc, GetI64ElementsAttr({num_elements}, builder)); + auto shape_tensor = stablehlo::ConstantOp::create( + *builder, loc, GetI64ElementsAttr({num_elements}, builder)); - auto lower = builder->create( - loc, builder->getI32IntegerAttr(lower_limit)); - auto upper = builder->create( - loc, builder->getI32IntegerAttr(upper_limit)); + auto lower = stablehlo::ConstantOp::create( + *builder, loc, builder->getI32IntegerAttr(lower_limit)); + auto upper = stablehlo::ConstantOp::create( + *builder, loc, builder->getI32IntegerAttr(upper_limit)); - return builder->create( - loc, lower, upper, shape_tensor, - ::mlir::stablehlo::RngDistribution::UNIFORM); + return stablehlo::RngOp::create(*builder, loc, lower, upper, shape_tensor, + ::mlir::stablehlo::RngDistribution::UNIFORM); } using WhileBodyFnType = llvm::function_refcreate( - loc, builder->getI32IntegerAttr(0))); + init_values_with_loop_iv.push_back(stablehlo::ConstantOp::create( + *builder, loc, builder->getI32IntegerAttr(0))); init_values_with_loop_iv.append(init_values.begin(), init_values.end()); // Accumulate types of all the init values. @@ -498,8 +497,8 @@ static void CreateWhile32(Location loc, int num_iterations, init_types_with_loop_iv.push_back(init_value_with_loop_iv.getType()); // Create the while op. - auto while_op = builder->create( - loc, init_types_with_loop_iv, init_values_with_loop_iv); + auto while_op = stablehlo::WhileOp::create( + *builder, loc, init_types_with_loop_iv, init_values_with_loop_iv); auto ivs_count = init_types_with_loop_iv.size(); { @@ -513,12 +512,13 @@ static void CreateWhile32(Location loc, int num_iterations, // Get the loop induction variable and compare it against the upper limit. auto loop_iv = block->getArgument(0); - auto upper_limit = builder->create( - loc, builder->getI32IntegerAttr(num_iterations)); - Value compare = builder->create( - loc, loop_iv, upper_limit, stablehlo::ComparisonDirection::LT); + auto upper_limit = stablehlo::ConstantOp::create( + *builder, loc, builder->getI32IntegerAttr(num_iterations)); + Value compare = + stablehlo::CompareOp::create(*builder, loc, loop_iv, upper_limit, + stablehlo::ComparisonDirection::LT); - builder->create(loc, compare); + stablehlo::ReturnOp::create(*builder, loc, compare); } { @@ -540,15 +540,15 @@ static void CreateWhile32(Location loc, int num_iterations, &new_values, builder); // Increment the loop induction variable by one. 
- auto one = builder->create( - loc, builder->getI32IntegerAttr(1)); + auto one = stablehlo::ConstantOp::create(*builder, loc, + builder->getI32IntegerAttr(1)); auto scalar_broadcast_dims = builder->getDenseI64ArrayAttr({}); - auto plus_one = builder->create( - loc, block->getArgument(0), one, scalar_broadcast_dims); + auto plus_one = chlo::BroadcastAddOp::create( + *builder, loc, block->getArgument(0), one, scalar_broadcast_dims); // Prepend with the updated loop induction variable. new_values.insert(new_values.begin(), plus_one); - builder->create(loc, new_values); + stablehlo::ReturnOp::create(*builder, loc, new_values); } // TODO(jpienaar): Support multi-operand while op. @@ -748,20 +748,20 @@ static void BuildArgMinMaxReductionBody( ImplicitLocOpBuilder b(loc, *builder); Value compare_dt = - b.create(lhs_val, rhs_val, direction); + stablehlo::CompareOp::create(b, lhs_val, rhs_val, direction); Value selected_input = - b.create(input_type, compare_dt, lhs_val, rhs_val); + stablehlo::SelectOp::create(b, input_type, compare_dt, lhs_val, rhs_val); - Value compare_eq = b.create( - lhs_val, rhs_val, stablehlo::ComparisonDirection::EQ); - Value min_index = b.create(lhs_index, rhs_index); - Value min_val_index = b.create(index_type, compare_dt, - lhs_index, rhs_index); - Value selected_index = b.create( - index_type, compare_eq, min_index, min_val_index); + Value compare_eq = stablehlo::CompareOp::create( + b, lhs_val, rhs_val, stablehlo::ComparisonDirection::EQ); + Value min_index = stablehlo::MinOp::create(b, lhs_index, rhs_index); + Value min_val_index = stablehlo::SelectOp::create(b, index_type, compare_dt, + lhs_index, rhs_index); + Value selected_index = stablehlo::SelectOp::create(b, index_type, compare_eq, + min_index, min_val_index); Value return_values[] = {selected_input, selected_index}; - b.create(return_values); + stablehlo::ReturnOp::create(b, return_values); } //===----------------------------------------------------------------------===// @@ -898,9 +898,9 @@ static void BuildBodyWithCall(PatternRewriter &rewriter, const Location &loc, Block *block = rewriter.createBlock(body); auto inputs = func_ty.getInputs(); block->addArguments(inputs, SmallVector(inputs.size(), loc)); - mlir::func::CallOp call_op = rewriter.create( - loc, func, func_ty.getResults(), block->getArguments()); - rewriter.create(loc, call_op.getResults()); + mlir::func::CallOp call_op = mlir::func::CallOp::create( + rewriter, loc, func, func_ty.getResults(), block->getArguments()); + stablehlo::ReturnOp::create(rewriter, loc, call_op.getResults()); } //===----------------------------------------------------------------------===// @@ -955,9 +955,9 @@ class ConvertBiasAddOp : public OpRewritePattern { auto bias_broadcast = Broadcast1DToFeatureDim( loc, op.getValue(), op.getBias(), feature_dim, rewriter); Value add = - rewriter.create(loc, op.getValue(), bias_broadcast); + stablehlo::AddOp::create(rewriter, loc, op.getValue(), bias_broadcast); if (add.getType() != op.getType()) { - add = rewriter.create(loc, op.getType(), add); + add = tensor::CastOp::create(rewriter, loc, op.getType(), add); } rewriter.replaceOp(op, {add}); return success(); @@ -986,7 +986,7 @@ class ConvertConvDynamic : public OpRewritePattern { switch (padding_type) { case tensorflow::Padding::VALID: { auto zero = - rewriter.create(loc, shape_scalar_type, 0); + arith::ConstantIntOp::create(rewriter, loc, shape_scalar_type, 0); *padding_low = *padding_high = zero; break; } @@ -994,48 +994,49 @@ class ConvertConvDynamic : public OpRewritePattern 
{ break; case tensorflow::Padding::SAME: { auto zero = - rewriter.create(loc, shape_scalar_type, 0); + arith::ConstantIntOp::create(rewriter, loc, shape_scalar_type, 0); auto one = - rewriter.create(loc, shape_scalar_type, 1); + arith::ConstantIntOp::create(rewriter, loc, shape_scalar_type, 1); auto two = - rewriter.create(loc, shape_scalar_type, 2); + arith::ConstantIntOp::create(rewriter, loc, shape_scalar_type, 2); // See also the parallel implementation in // GetWindowedOutputSizeFromDimsV2. effective_filter_size = (filter_size // - 1) * dilation_rate + 1 - Value stride_value = rewriter.create( - loc, shape_scalar_type, stride); - Value dilation_rate_value = rewriter.create( - loc, shape_scalar_type, dilation_rate); - Value effective_filter_size_op = rewriter.create( - loc, one, - rewriter.create( - loc, dilation_rate_value, - rewriter.create(loc, filter_size, one))); + Value stride_value = arith::ConstantIntOp::create( + rewriter, loc, shape_scalar_type, stride); + Value dilation_rate_value = arith::ConstantIntOp::create( + rewriter, loc, shape_scalar_type, dilation_rate); + Value effective_filter_size_op = arith::AddIOp::create( + rewriter, loc, one, + arith::MulIOp::create( + rewriter, loc, dilation_rate_value, + arith::SubIOp::create(rewriter, loc, filter_size, one))); // output_size = (input_size + stride - 1) / stride; - Value output_size = rewriter.create( - loc, - rewriter.create( - loc, input_size, - rewriter.create(loc, stride_value, one)), + Value output_size = arith::DivUIOp::create( + rewriter, loc, + arith::AddIOp::create( + rewriter, loc, input_size, + arith::SubIOp::create(rewriter, loc, stride_value, one)), stride_value); // std::max(int64{0}, (output_size - 1) * stride + // effective_filter_size - input_size); - Value padding_needed = rewriter.create( - loc, - rewriter.create( - loc, effective_filter_size_op, - rewriter.create( - loc, stride_value, - rewriter.create(loc, output_size, one))), + Value padding_needed = arith::SubIOp::create( + rewriter, loc, + arith::AddIOp::create( + rewriter, loc, effective_filter_size_op, + arith::MulIOp::create( + rewriter, loc, stride_value, + arith::SubIOp::create(rewriter, loc, output_size, one))), input_size); - Value cond = rewriter.create( - loc, arith::CmpIPredicate::sge, padding_needed, zero); - padding_needed = rewriter.create( - loc, padding_needed.getType(), cond, padding_needed, zero); + Value cond = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sge, padding_needed, zero); + padding_needed = mlir::arith::SelectOp::create( + rewriter, loc, padding_needed.getType(), cond, padding_needed, + zero); *padding_low = - rewriter.create(loc, padding_needed, two); + arith::DivUIOp::create(rewriter, loc, padding_needed, two); *padding_high = - rewriter.create(loc, padding_needed, *padding_low); + arith::SubIOp::create(rewriter, loc, padding_needed, *padding_low); break; } } @@ -1086,13 +1087,13 @@ class ConvertConvDynamic : public OpRewritePattern { auto shape_scalar_type = rewriter.getIntegerType(32); auto get_const = [&](int64_t val) { - return rewriter.create(loc, shape_scalar_type, - val); + return mlir::arith::ConstantIntOp::create(rewriter, loc, + shape_scalar_type, val); }; auto get_dim_value = [&](Value val, int64_t dim) { - Value dim_value = rewriter.create(loc, val, dim); - return rewriter.create(loc, shape_scalar_type, - dim_value); + Value dim_value = tensor::DimOp::create(rewriter, loc, val, dim); + return arith::IndexCastOp::create(rewriter, loc, shape_scalar_type, + dim_value); }; for (auto i : 
llvm::seq(0, num_spatial_dims)) { @@ -1149,8 +1150,8 @@ class ConvertConvDynamic : public OpRewritePattern { auto precision_config_attr = rewriter.getNamedAttr( "precision_config", GetPrecisionConfig(&rewriter)); - Value paddings_op = rewriter.create( - op.getLoc(), + Value paddings_op = tensor::FromElementsOp::create( + rewriter, op.getLoc(), tensorflow::GetTypeFromTFTensorShape(2 * num_spatial_dims, rewriter.getI32Type()), paddings); @@ -1166,8 +1167,8 @@ class ConvertConvDynamic : public OpRewritePattern { new_shape.push_back(1); new_shape.push_back(filter_shape[num_spatial_dims] * filter_shape[num_spatial_dims + 1]); - operands[1] = rewriter.create( - op.getLoc(), + operands[1] = stablehlo::ReshapeOp::create( + rewriter, op.getLoc(), tensorflow::GetTypeFromTFTensorShape(new_shape, filter_ty.getElementType()), operands[1]); @@ -1324,8 +1325,8 @@ class ConvertConvOp : public OpRewritePattern { new_shape.push_back(1); new_shape.push_back(filter_shape[num_spatial_dims] * filter_shape[num_spatial_dims + 1]); - operands[1] = rewriter.create( - op.getLoc(), + operands[1] = stablehlo::ReshapeOp::create( + rewriter, op.getLoc(), tensorflow::GetTypeFromTFTensorShape(new_shape, filter_ty.getElementType()), operands[1]); @@ -1373,35 +1374,35 @@ class ConvertPadOpDynamic : public OpRewritePattern { auto interior_attr = GetI64ElementsAttr(interior_values, &rewriter); Value interior_padding_tensor = - rewriter.create(loc, interior_attr); + stablehlo::ConstantOp::create(rewriter, loc, interior_attr); Type paddings_elem_ty = paddings_type.getElementType(); if (!paddings_elem_ty.isInteger(64)) { - interior_padding_tensor = rewriter.create( - loc, interior_padding_tensor, paddings_elem_ty); + interior_padding_tensor = stablehlo::ConvertOp::create( + rewriter, loc, interior_padding_tensor, paddings_elem_ty); } llvm::SmallVector transposed_shape = {2, input_rank}; auto transpose_attr = GetI64ArrayAttr({1, 0}, &rewriter); Value transposed_paddings = - rewriter.create(loc, paddings, transpose_attr); - Value reshaped_paddings = rewriter.create( - loc, - tensorflow::GetTypeFromTFTensorShape({input_rank * 2}, - paddings_elem_ty), - transposed_paddings); + stablehlo::TransposeOp::create(rewriter, loc, paddings, transpose_attr); + Value reshaped_paddings = + stablehlo::ReshapeOp::create(rewriter, loc, + tensorflow::GetTypeFromTFTensorShape( + {input_rank * 2}, paddings_elem_ty), + transposed_paddings); auto left_padding_start_attr = GetI64ArrayAttr({0}, &rewriter); auto left_padding_limit_attr = GetI64ArrayAttr({input_rank}, &rewriter); auto left_padding_stride_attr = GetI64ArrayAttr({1}, &rewriter); - Value left_padding_tensor = rewriter.create( - loc, reshaped_paddings, left_padding_start_attr, + Value left_padding_tensor = stablehlo::SliceOp::create( + rewriter, loc, reshaped_paddings, left_padding_start_attr, left_padding_limit_attr, left_padding_stride_attr); auto right_padding_start_attr = GetI64ArrayAttr({input_rank}, &rewriter); auto right_padding_limit_attr = GetI64ArrayAttr({2 * input_rank}, &rewriter); auto right_padding_stride_attr = GetI64ArrayAttr({1}, &rewriter); - Value right_padding_tensor = rewriter.create( - loc, reshaped_paddings, right_padding_start_attr, + Value right_padding_tensor = stablehlo::SliceOp::create( + rewriter, loc, reshaped_paddings, right_padding_start_attr, right_padding_limit_attr, right_padding_stride_attr); rewriter.replaceOpWithNewOp( @@ -1450,23 +1451,24 @@ class ConvertGatherNdOpDynamic : public OpRewritePattern { Value slice_sizes_value = nullptr; for (int64_t i = 0; i 
< params_rank; ++i) { if (i < num_index_dims) { - slice_sizes_vals.push_back(rewriter.create( - loc, rewriter.getIntegerAttr(indices_ty.getElementType(), 1))); + slice_sizes_vals.push_back(arith::ConstantOp::create( + rewriter, loc, + rewriter.getIntegerAttr(indices_ty.getElementType(), 1))); } else { int64_t dim_size = params_ty.getDimSize(i); if (dim_size != ShapedType::kDynamic) { - slice_sizes_vals.push_back(rewriter.create( - loc, + slice_sizes_vals.push_back(arith::ConstantOp::create( + rewriter, loc, rewriter.getIntegerAttr(indices_ty.getElementType(), dim_size))); } else { - slice_sizes_vals.push_back(rewriter.create( - loc, indices_ty.getElementType(), - rewriter.create(loc, params, i))); + slice_sizes_vals.push_back(arith::IndexCastOp::create( + rewriter, loc, indices_ty.getElementType(), + tensor::DimOp::create(rewriter, loc, params, i))); } } } slice_sizes_value = - rewriter.create(loc, slice_sizes_vals); + tensor::FromElementsOp::create(rewriter, loc, slice_sizes_vals); // collapsed_slice_dims SmallVector collapsed_slice_dims; @@ -1535,18 +1537,18 @@ class ConvertBF16FloorDivOp : public OpRewritePattern { auto out_type = op.getZ().getType(); - l = rewriter.create(op.getLoc(), l, - rewriter.getF32Type()); - r = rewriter.create(op.getLoc(), r, - rewriter.getF32Type()); + l = stablehlo::ConvertOp::create(rewriter, op.getLoc(), l, + rewriter.getF32Type()); + r = stablehlo::ConvertOp::create(rewriter, op.getLoc(), r, + rewriter.getF32Type()); - auto intermediate = rewriter.create( - op.getLoc(), + auto intermediate = TF::FloorDivOp::create( + rewriter, op.getLoc(), ChangeTensorElementType(&rewriter, out_type, rewriter.getF32Type()), l, r); - auto floor_op = rewriter.create(op.getLoc(), out_type, - intermediate); + auto floor_op = stablehlo::ConvertOp::create(rewriter, op.getLoc(), + out_type, intermediate); rewriter.replaceOp(op, floor_op.getResult()); return success(); } @@ -1615,24 +1617,26 @@ class ConvertRollOp : public OpRewritePattern { // offset = ((offset % axis_size) + axis_size) % axis_size ImplicitLocOpBuilder b(op.getLoc(), rewriter); Value offset = op.getShift(); - auto axis_size = b.create(b.getIntegerAttr( - getElementTypeOrSelf(offset.getType()), input_shape[axis])); - offset = b.create( - b.create( - b.create(offset, axis_size), axis_size), + auto axis_size = stablehlo::ConstantOp::create( + b, b.getIntegerAttr(getElementTypeOrSelf(offset.getType()), + input_shape[axis])); + offset = stablehlo::RemOp::create( + b, + stablehlo::AddOp::create( + b, stablehlo::RemOp::create(b, offset, axis_size), axis_size), axis_size); // Stack two copies of the dimension, then slice from the calculated // offset. This also works if shift is not constant. // DynamicSliceOp requires the sizes being integer, and we can get the // information from input shape. 
- auto concat = b.create( - ValueRange{op.getInput(), op.getInput()}, b.getI64IntegerAttr(axis)); - Value zero = b.create( - b.getIntegerAttr(getElementTypeOrSelf(offset.getType()), 0)); + auto concat = stablehlo::ConcatenateOp::create( + b, ValueRange{op.getInput(), op.getInput()}, b.getI64IntegerAttr(axis)); + Value zero = stablehlo::ConstantOp::create( + b, b.getIntegerAttr(getElementTypeOrSelf(offset.getType()), 0)); SmallVector slice_begin_indices(input_rank, zero); slice_begin_indices[axis] = - b.create(axis_size, offset); + stablehlo::SubtractOp::create(b, axis_size, offset); rewriter.replaceOpWithNewOp( op, input_ty, concat, slice_begin_indices, GetI64ArrayAttr(input_shape, &rewriter)); @@ -1656,10 +1660,10 @@ class ConvertLeakyReluOp : public OpRewritePattern { Value zeroVal = chlo::getConstantLike(rewriter, loc, 0.0, features); Value leakyActivationVal = - rewriter.create(loc, features, alphaVal); + stablehlo::MulOp::create(rewriter, loc, features, alphaVal); - Value compareGtZero = rewriter.create( - loc, features, zeroVal, stablehlo::ComparisonDirection::GT); + Value compareGtZero = stablehlo::CompareOp::create( + rewriter, loc, features, zeroVal, stablehlo::ComparisonDirection::GT); rewriter.replaceOpWithNewOp( op, compareGtZero, features, leakyActivationVal); @@ -1686,10 +1690,10 @@ class ConvertLeakyReluGradOp : public OpRewritePattern { Value zeroVal = chlo::getConstantLike(rewriter, loc, 0.0, features); Value leakyGradientVal = - rewriter.create(loc, gradients, alphaVal); + stablehlo::MulOp::create(rewriter, loc, gradients, alphaVal); - Value compareGtZero = rewriter.create( - loc, features, zeroVal, stablehlo::ComparisonDirection::GT); + Value compareGtZero = stablehlo::CompareOp::create( + rewriter, loc, features, zeroVal, stablehlo::ComparisonDirection::GT); rewriter.replaceOpWithNewOp( op, featureType, compareGtZero, gradients, leakyGradientVal); @@ -1733,29 +1737,30 @@ class ConvertDiagPartOp : public OpRewritePattern { new_size *= input_type.getDimSize(i); new_dims.push_back(input_type.getDimSize(i)); } - Value reshaped_input = rewriter.create( - op.getLoc(), + Value reshaped_input = stablehlo::ReshapeOp::create( + rewriter, op.getLoc(), tensorflow::GetTypeFromTFTensorShape({new_size, new_size}, input_type.getElementType()), op.getInput()); auto iota_type = tensorflow::GetTypeFromTFTensorShape( {new_size, new_size}, rewriter.getIntegerType(32)); - auto iota0 = rewriter.create( - op.getLoc(), iota_type, rewriter.getI64IntegerAttr(0)); - auto iota1 = rewriter.create( - op.getLoc(), iota_type, rewriter.getI64IntegerAttr(1)); - Value compare = rewriter.create( - op.getLoc(), iota0, iota1, stablehlo::ComparisonDirection::EQ); + auto iota0 = stablehlo::IotaOp::create(rewriter, op.getLoc(), iota_type, + rewriter.getI64IntegerAttr(0)); + auto iota1 = stablehlo::IotaOp::create(rewriter, op.getLoc(), iota_type, + rewriter.getI64IntegerAttr(1)); + Value compare = + stablehlo::CompareOp::create(rewriter, op.getLoc(), iota0, iota1, + stablehlo::ComparisonDirection::EQ); Value zero = GetScalarConstOfType(input_type.getElementType(), op.getLoc(), 0, &rewriter); - Value zero_matrix = rewriter.create( - op.getLoc(), reshaped_input.getType(), zero, + Value zero_matrix = stablehlo::BroadcastOp::create( + rewriter, op.getLoc(), reshaped_input.getType(), zero, GetI64ArrayAttr({new_size, new_size}, &rewriter)); - Value masked = rewriter.create( - op.getLoc(), reshaped_input.getType(), compare, reshaped_input, - zero_matrix); - auto reduce = rewriter.create( - op.getLoc(), masked, zero, 
GetI64ArrayAttr({0}, &rewriter), + Value masked = stablehlo::SelectOp::create( + rewriter, op.getLoc(), reshaped_input.getType(), compare, + reshaped_input, zero_matrix); + auto reduce = stablehlo::ReduceOp::create( + rewriter, op.getLoc(), masked, zero, GetI64ArrayAttr({0}, &rewriter), input_type.getElementType()); assert(!input_type.getElementType().isInteger(1) && "data type should not be i1"); @@ -1802,8 +1807,8 @@ class ConvertMatrixDiagPartV3Op stablehlo::BroadcastOp BroadcastConstant(Location loc, Shape shape, int32_t constant, int int_size, PatternRewriter &rewriter) const { - return rewriter.create( - loc, + return stablehlo::BroadcastOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape(shape, rewriter.getIntegerType(int_size)), GetScalarConstOfType(rewriter.getIntegerType(int_size), loc, constant, @@ -1878,10 +1883,10 @@ class ConvertMatrixDiagPartV3Op RankedTensorType iota_type = tensorflow::GetTypeFromTFTensorShape( indices_shape, rewriter.getIntegerType(32)); - Value iotaM = rewriter.create( - loc, iota_type, rewriter.getI64IntegerAttr(1)); - Value iotaN = rewriter.create( - loc, iota_type, rewriter.getI64IntegerAttr(2)); + Value iotaM = stablehlo::IotaOp::create(rewriter, loc, iota_type, + rewriter.getI64IntegerAttr(1)); + Value iotaN = stablehlo::IotaOp::create(rewriter, loc, iota_type, + rewriter.getI64IntegerAttr(2)); // Boradcasted constants, of the same shape as iotaM and iotaN. Value b_zero = BroadcastConstant(loc, indices_shape, 0, 32, rewriter); @@ -1898,17 +1903,19 @@ class ConvertMatrixDiagPartV3Op // subtract m here. This means we start with the superdiagonals and // move downwards towards the subdiagonals. So the start indices will // be decreasing.) - Value d = rewriter.create(loc, b_k1, iotaM); - Value neg_d = rewriter.create(loc, d); + Value d = stablehlo::SubtractOp::create(rewriter, loc, b_k1, iotaM); + Value neg_d = stablehlo::NegOp::create(rewriter, loc, d); // diag_len_d = min(rows + min(d, 0), cols - max(d, 0)) // (Length of a diagonal for a given d. Same as max_diag_len for m = 0.) - Value diag_len_d = rewriter.create( - loc, - rewriter.create( - loc, b_rows, rewriter.create(loc, d, b_zero)), - rewriter.create( - loc, b_cols, rewriter.create(loc, d, b_zero))); + Value diag_len_d = stablehlo::MinOp::create( + rewriter, loc, + stablehlo::AddOp::create( + rewriter, loc, b_rows, + stablehlo::MinOp::create(rewriter, loc, d, b_zero)), + stablehlo::SubtractOp::create( + rewriter, loc, b_cols, + stablehlo::MaxOp::create(rewriter, loc, d, b_zero))); // offset is max_diag_len - diag_len_d if we're padding, 0 otherwise. Value cmp; @@ -1916,10 +1923,10 @@ class ConvertMatrixDiagPartV3Op cmp = b_true; } else if (superdiagonal_align == kRight) { // offset = d>=0 ? max_diag_len - diag_len_d : 0 - cmp = rewriter.create(loc, d, b_zero); + cmp = TF::GreaterEqualOp::create(rewriter, loc, d, b_zero); } else if (subdiagonal_align == kRight) { // offset = d<=0 ? max_diag_len - diag_len_d : 0 - cmp = rewriter.create(loc, d, b_zero); + cmp = TF::LessEqualOp::create(rewriter, loc, d, b_zero); } else { // offset = 0 cmp = b_false; @@ -1927,45 +1934,48 @@ class ConvertMatrixDiagPartV3Op // This offset shifts the diagonals to the "left" or "right", depending // on alignment. 
- Value offset = rewriter.create( - loc, b_zero.getType(), cmp, - rewriter.create(loc, b_max_diag_len, diag_len_d), + Value offset = stablehlo::SelectOp::create( + rewriter, loc, b_zero.getType(), cmp, + stablehlo::SubtractOp::create(rewriter, loc, b_max_diag_len, + diag_len_d), b_zero); // x = max(d, 0) - offset // y = max(-d, 0) - offset - Value x = rewriter.create( - loc, rewriter.create(loc, d, b_zero), offset); - Value y = rewriter.create( - loc, rewriter.create(loc, neg_d, b_zero), offset); + Value x = stablehlo::SubtractOp::create( + rewriter, loc, stablehlo::MaxOp::create(rewriter, loc, d, b_zero), + offset); + Value y = stablehlo::SubtractOp::create( + rewriter, loc, stablehlo::MaxOp::create(rewriter, loc, neg_d, b_zero), + offset); - Value n_plus_x = rewriter.create(loc, iotaN, x); - Value n_plus_y = rewriter.create(loc, iotaN, y); + Value n_plus_x = stablehlo::AddOp::create(rewriter, loc, iotaN, x); + Value n_plus_y = stablehlo::AddOp::create(rewriter, loc, iotaN, y); // GatherOp is happy about letting us index out of bounds values, but those // values will be undefined. So we mask them later. Set up the boolean // expression that tells us which entries, in the output shape, are out of // bounds and thus become the padding_value. - Value x_in_bounds = rewriter.create( - loc, - rewriter.create(loc, b_false.getType(), n_plus_x, - b_zero), - rewriter.create(loc, b_false.getType(), n_plus_x, b_cols)); - Value y_in_bounds = rewriter.create( - loc, - rewriter.create(loc, b_false.getType(), n_plus_y, - b_zero), - rewriter.create(loc, b_false.getType(), n_plus_y, b_rows)); - Value in_bounds = rewriter.create( - loc, + Value x_in_bounds = stablehlo::AndOp::create( + rewriter, loc, + TF::GreaterEqualOp::create(rewriter, loc, b_false.getType(), n_plus_x, + b_zero), + TF::LessOp::create(rewriter, loc, b_false.getType(), n_plus_x, b_cols)); + Value y_in_bounds = stablehlo::AndOp::create( + rewriter, loc, + TF::GreaterEqualOp::create(rewriter, loc, b_false.getType(), n_plus_y, + b_zero), + TF::LessOp::create(rewriter, loc, b_false.getType(), n_plus_y, b_rows)); + Value in_bounds = stablehlo::ReshapeOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape(Shape({num_diags, max_diag_len}), rewriter.getIntegerType(1)), - rewriter.create(loc, x_in_bounds, y_in_bounds)); + stablehlo::AndOp::create(rewriter, loc, x_in_bounds, y_in_bounds)); // Now combine x and y into the index data structure needed for gather. 
Shape concat_shape({2, num_diags, max_diag_len}); - Value start_indices = rewriter.create( - loc, + Value start_indices = stablehlo::ConcatenateOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape(concat_shape, rewriter.getIntegerType(32)), mlir::ValueRange({n_plus_y, n_plus_x}), @@ -2009,8 +2019,8 @@ class ConvertMatrixDiagPartV3Op /*operandBatchingDims=*/{}, /*startIndicesBatchingDims=*/{}, start_index_map, /*indexVectorDim=*/0); - Value gather = rewriter.create( - loc, op.getInput(), start_indices, dims_attr, + Value gather = stablehlo::GatherOp::create( + rewriter, loc, op.getInput(), start_indices, dims_attr, GetI64ArrayAttr(slice_sizes, &rewriter)); // We now need to broadcast the "in_bounds" boolean expression, as well as @@ -2019,22 +2029,24 @@ class ConvertMatrixDiagPartV3Op for (int i = 0; i < output_shape.size() - 2; i++) { broadcast_bounds.push_back(output_shape[i]); } - Value b_in_bounds = rewriter.create( - loc, + Value b_in_bounds = stablehlo::BroadcastOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape(output_shape, rewriter.getIntegerType(1)), in_bounds, GetI64ArrayAttr(broadcast_bounds, &rewriter)); - Value b_padding = rewriter.create( - loc, op.getPaddingValue(), GetI64ArrayAttr(output_shape, &rewriter)); + Value b_padding = stablehlo::BroadcastOp::create( + rewriter, loc, op.getPaddingValue(), + GetI64ArrayAttr(output_shape, &rewriter)); // Replace all out-of-bounds values in the result with padding_value. - Value result = rewriter.create(loc, b_in_bounds, - gather, b_padding); + Value result = stablehlo::SelectOp::create(rewriter, loc, b_in_bounds, + gather, b_padding); if (num_diags == 1) { // matrix_diag_part folds away the 1-sized band dimension if we only // extract a single diagonal. - result = rewriter.create(loc, op.getType(), result); + result = + stablehlo::ReshapeOp::create(rewriter, loc, op.getType(), result); } rewriter.replaceOp(op, result); @@ -2057,9 +2069,10 @@ class ConvertEinsumOp : public OpRewritePattern { // creates a scalar constant 1.0 for first operand. if (op.getN() == 1) { equation_str = "," + equation_str; - inputs.push_back(rewriter.create( - op.getLoc(), hlo::getScalarOfType( - mlir::getElementTypeOrSelf(op.getOperand(0)), 1))); + inputs.push_back(stablehlo::ConstantOp::create( + rewriter, op.getLoc(), + hlo::getScalarOfType(mlir::getElementTypeOrSelf(op.getOperand(0)), + 1))); } // Insert remaining operands into inputs, TF op verifier requires there be // 0 or 1 operands. @@ -2129,8 +2142,8 @@ class ConvertFFTOp : public OpRewritePattern { // Last dim larger than expected_dim, slice the input if (input_shape.back() > expected_dim) { - reshaped = rewriter.create( - op.getLoc(), + reshaped = stablehlo::SliceOp::create( + rewriter, op.getLoc(), tensorflow::GetTypeFromTFTensorShape(expected_shape, input_ty.getElementType()), op.getInput(), GetI64ArrayAttr(begin_indices, &rewriter), @@ -2144,8 +2157,8 @@ class ConvertFFTOp : public OpRewritePattern { padding.push_back(expected_dim - input_shape.back()); Value zero = GetScalarConstOfType(input_ty.getElementType(), loc, 0, &rewriter); - reshaped = rewriter.create( - loc, + reshaped = stablehlo::PadOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape(expected_shape, input_ty.getElementType()), op.getInput(), zero, GetI64ArrayAttr(no_padding, &rewriter), @@ -2193,8 +2206,8 @@ class ConvertFusedBatchNormGradBase // To support mixed precision, the statistics type, which maybe more // precise than the input types, are used for this op. 
Type kernel_type = mlir::cast(scale.getType()).getElementType(); - grad = rewriter.create(loc, grad, kernel_type); - act = rewriter.create(loc, act, kernel_type); + grad = stablehlo::ConvertOp::create(rewriter, loc, grad, kernel_type); + act = stablehlo::ConvertOp::create(rewriter, loc, act, kernel_type); tensorflow::TensorFormat data_format; if (!FormatFromString(op.getDataFormat().str(), &data_format)) @@ -2213,9 +2226,9 @@ class ConvertFusedBatchNormGradBase SmallVector operand_types = {act.getType(), feature_type, feature_type}; - auto training_op = rewriter.create( - loc, operand_types, act, scale, mean, var, grad, op.getEpsilon(), - feature_dim); + auto training_op = stablehlo::BatchNormGradOp::create( + rewriter, loc, operand_types, act, scale, mean, var, grad, + op.getEpsilon(), feature_dim); x_backprop = training_op.getResult(0); @@ -2234,52 +2247,55 @@ class ConvertFusedBatchNormGradBase // scratch1 = rsqrt(var + epsilon) RankedTensorType scalar_float = tensorflow::GetTypeFromTFTensorShape({}, kernel_type); - auto epsilon = rewriter.create( - loc, DenseFPElementsAttr::get(scalar_float, {op.getEpsilon()})); - auto add_op = rewriter.create( - loc, var, epsilon.getResult(), scalar_broadcast_dims); + auto epsilon = stablehlo::ConstantOp::create( + rewriter, loc, + DenseFPElementsAttr::get(scalar_float, {op.getEpsilon()})); + auto add_op = chlo::BroadcastAddOp::create( + rewriter, loc, var, epsilon.getResult(), scalar_broadcast_dims); - Value scratch1 = rewriter.create(loc, add_op); + Value scratch1 = stablehlo::RsqrtOp::create(rewriter, loc, add_op); // scratch2 = sum(y_backprop * (x - mean)) - auto sub_op = rewriter.create( - loc, act, + auto sub_op = stablehlo::SubtractOp::create( + rewriter, loc, act, Broadcast1DToFeatureDim(loc, act, mean, feature_dim, rewriter)); - auto weighted_grad = rewriter.create(loc, grad, sub_op); + auto weighted_grad = + stablehlo::MulOp::create(rewriter, loc, grad, sub_op); Value scratch2 = ApplyReduction(loc, weighted_grad, reduce_dims, &rewriter); // x_backprop = y_backprop * (scale * scratch1) auto scaled_grad = - rewriter.create(loc, op.getScale(), scratch1); - x_backprop = rewriter.create( - loc, grad, + stablehlo::MulOp::create(rewriter, loc, op.getScale(), scratch1); + x_backprop = stablehlo::MulOp::create( + rewriter, loc, grad, Broadcast1DToFeatureDim(loc, act, scaled_grad, feature_dim, rewriter)); // scale_backprop = scratch2 * scratch1 scale_backprop = - rewriter.create(loc, scratch1, scratch2); + stablehlo::MulOp::create(rewriter, loc, scratch1, scratch2); // offset_backprop = sum(y_backprop) offset_backprop = ApplyReduction(loc, grad, reduce_dims, &rewriter); } x_backprop = - rewriter.create(loc, x_backprop, act_ele_type); + stablehlo::ConvertOp::create(rewriter, loc, x_backprop, act_ele_type); Value last_val[2]; if (op.getResult(3).use_empty() && op.getResult(4).use_empty()) { // It doesn't matter what values we provide for the last 2 results. 
last_val[0] = last_val[1] = op.getX(); } else { - auto const_val = rewriter.create( - op.getLoc(), DenseElementsAttr::get( - tensorflow::GetTypeFromTFTensorShape( - {0}, getElementTypeOrSelf(op.getResult(3))), - 0.0)); + auto const_val = stablehlo::ConstantOp::create( + rewriter, op.getLoc(), + DenseElementsAttr::get( + tensorflow::GetTypeFromTFTensorShape( + {0}, getElementTypeOrSelf(op.getResult(3))), + 0.0)); auto maybe_cast = [&](Value val, Type t) -> Value { if (val.getType() == t) return val; - return rewriter.create(op.getLoc(), t, val); + return tensor::CastOp::create(rewriter, op.getLoc(), t, val); }; last_val[0] = maybe_cast(const_val, op.getResult(3).getType()); last_val[1] = maybe_cast(const_val, op.getResult(4).getType()); @@ -2333,8 +2349,8 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { // TODO(b/69928690): Support mixed precision in the XLA batch // normalization operators. As a workaround, create a new x with the same // element type as scale (which may be more precise than the input type). - Value bn_train_input = rewriter.create( - op.getLoc(), op.getX(), scale_element_type); + Value bn_train_input = stablehlo::ConvertOp::create( + rewriter, op.getLoc(), op.getX(), scale_element_type); TensorType bn_train_input_type_tensor = mlir::cast(bn_train_input.getType()); @@ -2351,8 +2367,8 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { // batch_mean, and batch_var. SmallVector operand_types = {bn_train_input_type_tensor, mean_var_type, mean_var_type}; - auto bn_train_op = rewriter.create( - op.getLoc(), operand_types, bn_train_input, op.getScale(), + auto bn_train_op = stablehlo::BatchNormTrainingOp::create( + rewriter, op.getLoc(), operand_types, bn_train_input, op.getScale(), op.getOffset(), op.getEpsilon(), feature_dim.getInt()); // HLO op outputs a tuple of tensors. Extract those results. Value y_out = bn_train_op.getResult(0); @@ -2368,48 +2384,53 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { int sample_size_minus_one = std::max(1, sample_size - 1); double factor = static_cast(sample_size) / static_cast(sample_size_minus_one); - auto factor_const_op = rewriter.create( - op.getLoc(), rewriter.getFloatAttr(scale_element_type, factor)); + auto factor_const_op = stablehlo::ConstantOp::create( + rewriter, op.getLoc(), + rewriter.getFloatAttr(scale_element_type, factor)); - Value corrected_variance = rewriter.create( - op.getLoc(), batch_variance.getType(), batch_variance, + Value corrected_variance = chlo::BroadcastMulOp::create( + rewriter, op.getLoc(), batch_variance.getType(), batch_variance, factor_const_op, /*broadcast_dimensions=*/DenseI64ArrayAttr()); // Convert back to input type to stay aligned with expected output type // for TF op. - y_out = rewriter.create(op.getLoc(), y_out, - input_element_type); + y_out = stablehlo::ConvertOp::create(rewriter, op.getLoc(), y_out, + input_element_type); float exponential_avg_factor = op.getExponentialAvgFactor().convertToFloat(); if (exponential_avg_factor != 1.0f) { - auto alpha = rewriter.create( - op.getLoc(), rewriter.getFloatAttr(mean_element_type, - 1.0f - exponential_avg_factor)); - auto beta = rewriter.create( - op.getLoc(), + auto alpha = stablehlo::ConstantOp::create( + rewriter, op.getLoc(), + rewriter.getFloatAttr(mean_element_type, + 1.0f - exponential_avg_factor)); + auto beta = stablehlo::ConstantOp::create( + rewriter, op.getLoc(), rewriter.getFloatAttr(mean_element_type, exponential_avg_factor)); // new_running_mean = alpha * old_mean + beta * batch_mean. 
- auto alpha_mul_old_mean = rewriter.create( - op.getLoc(), op.getMean().getType(), alpha, op.getMean(), + auto alpha_mul_old_mean = chlo::BroadcastMulOp::create( + rewriter, op.getLoc(), op.getMean().getType(), alpha, op.getMean(), /*broadcast_dimensions=*/DenseI64ArrayAttr()); - auto beta_mul_batch_mean = rewriter.create( - op.getLoc(), batch_mean.getType(), beta, batch_mean, + auto beta_mul_batch_mean = chlo::BroadcastMulOp::create( + rewriter, op.getLoc(), batch_mean.getType(), beta, batch_mean, /*broadcast_dimensions=*/DenseI64ArrayAttr()); - batch_mean = rewriter.create( - op.getLoc(), alpha_mul_old_mean, beta_mul_batch_mean, + batch_mean = chlo::BroadcastAddOp::create( + rewriter, op.getLoc(), alpha_mul_old_mean, beta_mul_batch_mean, /*broadcast_dimensions=*/DenseI64ArrayAttr()); // new_running_variance = alpha * old_variance + beta * batch_variance. - auto alpha_mul_old_variance = rewriter.create( - op.getLoc(), op.getVariance().getType(), alpha, op.getVariance(), + auto alpha_mul_old_variance = chlo::BroadcastMulOp::create( + rewriter, op.getLoc(), op.getVariance().getType(), alpha, + op.getVariance(), /*broadcast_dimensions=*/DenseI64ArrayAttr()); - auto beta_mul_batch_variance = rewriter.create( - op.getLoc(), corrected_variance.getType(), beta, corrected_variance, + auto beta_mul_batch_variance = chlo::BroadcastMulOp::create( + rewriter, op.getLoc(), corrected_variance.getType(), beta, + corrected_variance, /*broadcast_dimensions=*/DenseI64ArrayAttr()); - corrected_variance = rewriter.create( - op.getLoc(), alpha_mul_old_variance, beta_mul_batch_variance, + corrected_variance = chlo::BroadcastAddOp::create( + rewriter, op.getLoc(), alpha_mul_old_variance, + beta_mul_batch_variance, /*broadcast_dimensions=*/DenseI64ArrayAttr()); } @@ -2433,11 +2454,12 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { : 0; auto const_attr_type = tensorflow::GetTypeFromTFTensorShape( {num_elements}, getElementTypeOrSelf(reserve_space_3_type)); - Value dummy_const = rewriter.create( - op.getLoc(), DenseElementsAttr::get(const_attr_type, 0.0)); + Value dummy_const = stablehlo::ConstantOp::create( + rewriter, op.getLoc(), + DenseElementsAttr::get(const_attr_type, 0.0)); if (const_attr_type != reserve_space_3_type) - dummy_const = rewriter.create( - op.getLoc(), reserve_space_3_type, dummy_const); + dummy_const = tensor::CastOp::create( + rewriter, op.getLoc(), reserve_space_3_type, dummy_const); rewriter.replaceOp(op, {y_out, /*batch_mean=*/batch_mean, /*batch_variance=*/corrected_variance, /*reserve_space_1=*/reserve_space_1, @@ -2445,16 +2467,16 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { /*reserve_space_3=*/dummy_const}); } } else { // Inference case. - auto bn_train_op = rewriter.create( - op.getLoc(), + auto bn_train_op = stablehlo::BatchNormInferenceOp::create( + rewriter, op.getLoc(), /*result_type=*/bn_train_input_type_tensor, bn_train_input, op.getScale(), op.getOffset(), op.getMean(), op.getVariance(), op.getEpsilon(), feature_dim.getInt()); // Convert back to input type to stay aligned with expected output type // for TF op. - auto y_out = rewriter.create( - op.getLoc(), bn_train_op, input_element_type); + auto y_out = stablehlo::ConvertOp::create( + rewriter, op.getLoc(), bn_train_op, input_element_type); // The mean, variance, and reserved space outputs of the batch norm op are // not used for inference. 
It doesn't matter what values we provide for @@ -2477,11 +2499,12 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { : 0; auto const_attr_type = tensorflow::GetTypeFromTFTensorShape( {num_elements}, getElementTypeOrSelf(reserve_space_3_type)); - Value dummy_const = rewriter.create( - op.getLoc(), DenseElementsAttr::get(const_attr_type, 0.0)); + Value dummy_const = stablehlo::ConstantOp::create( + rewriter, op.getLoc(), + DenseElementsAttr::get(const_attr_type, 0.0)); if (const_attr_type != reserve_space_3_type) - dummy_const = rewriter.create( - op.getLoc(), reserve_space_3_type, dummy_const); + dummy_const = tensor::CastOp::create( + rewriter, op.getLoc(), reserve_space_3_type, dummy_const); rewriter.replaceOp(op, {/*y=*/y_out, /*batch_mean=*/op.getMean(), /*batch_variance=*/op.getVariance(), @@ -2580,8 +2603,8 @@ Operation *AvgPoolDivideByCount( Value divisor = GetScalarConstOfType(element_type, loc, window_count, &rewriter); auto scalar_broadcast_dims = rewriter.getDenseI64ArrayAttr({}); - result = rewriter.create( - loc, pooled_type, pooled, divisor, scalar_broadcast_dims); + result = chlo::BroadcastDivOp::create(rewriter, loc, pooled_type, pooled, + divisor, scalar_broadcast_dims); } else { assert(op.getPadding() == "SAME"); // For SAME padding, only original entries that contributed to a window @@ -2589,7 +2612,7 @@ Operation *AvgPoolDivideByCount( // Build all-ones tensor of same shape as the original input. ElementsAttr splat = hlo::getSplat(&rewriter, orig_input_type, 1); - auto all_ones_tensor = rewriter.create(loc, splat); + auto all_ones_tensor = stablehlo::ConstantOp::create(rewriter, loc, splat); // Get padding for the input. DenseIntElementsAttr input_padding_attr = @@ -2599,8 +2622,8 @@ Operation *AvgPoolDivideByCount( // Count the 1's in each window, using the same padding as for the input, // which gives us the window counts by which `pooled` needs to be divided. - auto divisor = rewriter.create( - loc, pooled_type, + auto divisor = stablehlo::ReduceWindowOp::create( + rewriter, loc, pooled_type, /*operand=*/all_ones_tensor, /*init_value=*/zero, /*window_dimensions=*/ @@ -2614,8 +2637,8 @@ Operation *AvgPoolDivideByCount( &rewriter); // Divide `pooled` by window counts. - result = rewriter.create(loc, pooled_type, pooled, - divisor.getResult(0)); + result = stablehlo::DivOp::create(rewriter, loc, pooled_type, pooled, + divisor.getResult(0)); } return result; } @@ -2651,8 +2674,8 @@ class ConvertAvgPoolOp : public OpRewritePattern { // Convert if we need enlarge the element type's bitwidth. if (input_element_type != sum_element_type) - input_value = rewriter.create( - op.getLoc(), input_value, sum_element_type); + input_value = stablehlo::ConvertOp::create(rewriter, op.getLoc(), + input_value, sum_element_type); // Create the ReduceWindow op. 
Value init = @@ -2660,8 +2683,8 @@ class ConvertAvgPoolOp : public OpRewritePattern { DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr( input_type.getShape(), op.getKsize(), op.getStrides(), op.getPadding(), &rewriter); - auto reduce = rewriter.create( - op.getLoc(), result_type, input_value, init, + auto reduce = stablehlo::ReduceWindowOp::create( + rewriter, op.getLoc(), result_type, input_value, init, ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter), ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter), /*base_dilations=*/DenseI64ArrayAttr(), @@ -2683,8 +2706,8 @@ class ConvertAvgPoolOp : public OpRewritePattern { // Convert back if we enlarged the element type's bitwidth. Value result = result_op->getOpResult(0); if (input_element_type != sum_element_type) - result = rewriter.create(op.getLoc(), result, - input_element_type); + result = stablehlo::ConvertOp::create(rewriter, op.getLoc(), result, + input_element_type); rewriter.replaceOp(op, result); return success(); @@ -2825,8 +2848,9 @@ class ConvertAvgPoolGradOp : public OpRewritePattern { out_grad_shape[dim] = low_padding[dim] + high_padding[dim] + (out_grad_shape[dim] - 1) * strides[dim] + 1; } - Value reduce_window_input = rewriter.create( - loc, tensorflow::GetTypeFromTFTensorShape(out_grad_shape, element_type), + Value reduce_window_input = stablehlo::PadOp::create( + rewriter, loc, + tensorflow::GetTypeFromTFTensorShape(out_grad_shape, element_type), /*operand=*/out_grad_divided->getOpResult(0), /*padding_value=*/zero, /*edge_padding_low=*/GetI64ArrayAttr(low_padding, &rewriter), @@ -2839,13 +2863,13 @@ class ConvertAvgPoolGradOp : public OpRewritePattern { Type sum_element_type = GetSumAccumulationType(element_type); if (element_type != sum_element_type) { // Convert to appropriate sum accumulation type to avoid precision loss. - reduce_window_input = rewriter.create( - loc, reduce_window_input, sum_element_type); + reduce_window_input = stablehlo::ConvertOp::create( + rewriter, loc, reduce_window_input, sum_element_type); zero = GetScalarConstOfType(sum_element_type, loc, 0, &rewriter); } auto ones = GetI64ArrayAttr(DimVector(num_dims, 1), &rewriter); - auto reduce_window_op = rewriter.create( - loc, + auto reduce_window_op = stablehlo::ReduceWindowOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape(orig_input_shape, sum_element_type), /*operand=*/reduce_window_input, @@ -2862,8 +2886,8 @@ class ConvertAvgPoolGradOp : public OpRewritePattern { if (element_type != sum_element_type) { // Convert back to original element type. 
- result = rewriter.create(op.getLoc(), result, - element_type); + result = stablehlo::ConvertOp::create(rewriter, op.getLoc(), result, + element_type); } rewriter.replaceOp(op, {result}); return success(); @@ -2909,8 +2933,8 @@ class ConvertMaxPoolOp : public OpRewritePattern { DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr( input_ty.getShape(), op.getKsize(), op.getStrides(), op.getPadding(), &rewriter); - auto reduce = rewriter.create( - loc, op.getType(), op.getInput(), init, + auto reduce = stablehlo::ReduceWindowOp::create( + rewriter, loc, op.getType(), op.getInput(), init, ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter), ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter), /*base_dilations=*/DenseI64ArrayAttr(), @@ -2958,7 +2982,7 @@ class ConvertSelectOp : public OpRewritePattern { bool needs_broadcast = cond_type.getRank() == 1 && then_type.getRank() != 1; Value then_shape_split = then_shape; if (needs_broadcast) { - Value const_one = b.create(1); + Value const_one = arith::ConstantIndexOp::create(b, 1); Type extent_first = shape::getExtentTensorType(b.getContext(), 1); Type extent_second = shape::getExtentTensorType(b.getContext(), then_type.getRank() - 1); @@ -2978,7 +3002,7 @@ class ConvertSelectOp : public OpRewritePattern { } auto result_type = mlir::cast(op.getResult().getType()); auto assuming_op = - b.create(ArrayRef{result_type}, assumption); + shape::AssumingOp::create(b, ArrayRef{result_type}, assumption); OpBuilder::InsertionGuard guard(b); b.createBlock(&assuming_op.getDoRegion()); @@ -2986,17 +3010,18 @@ class ConvertSelectOp : public OpRewritePattern { // Broadcast the cond if necessary. Value cond = op.getCondition(); if (needs_broadcast) { - Value result_extents = b.create( - GetExtentsTensorTypeFor(result_type), then_shape); - cond = b.create( + Value result_extents = shape::ToExtentTensorOp::create( + b, GetExtentsTensorTypeFor(result_type), then_shape); + cond = stablehlo::DynamicBroadcastInDimOp::create( + b, tensorflow::GetTypeFromTFTensorShape(result_type.getShape(), b.getI1Type()), cond, result_extents, GetI64ArrayAttrForSeq(0, cond_type.getRank(), &b)); } - Value select = b.create( - result_type, cond, op.getThenValue(), op.getElseValue()); - b.create(select); + Value select = stablehlo::SelectOp::create( + b, result_type, cond, op.getThenValue(), op.getElseValue()); + shape::AssumingYieldOp::create(b, select); rewriter.replaceOp(op, {assuming_op.getResult(0)}); return success(); } @@ -3034,57 +3059,58 @@ class ConvertSliceOpDynamic : public OpRewritePattern { int rank = begin_type.getDimSize(0); auto shape_scalar_type = begin_type.getElementType(); - Value one = rewriter.create(loc, 1); + Value one = arith::ConstantIndexOp::create(rewriter, loc, 1); SmallVector stride_values(rank, one); SmallVector end_values; SmallVector begin_values; end_values.reserve(rank); for (int i = 0; i < rank; ++i) { SmallVector indices; - indices.push_back(rewriter.create(loc, i)); + indices.push_back(arith::ConstantIndexOp::create(rewriter, loc, i)); auto begin_value = - rewriter.create(loc, begin_indices, indices); - auto size_value = rewriter.create(loc, sizes, indices); - Value minus_one = rewriter.create( - loc, shape_scalar_type, - rewriter.create(loc, -1)); - auto is_minus_one = rewriter.create( - loc, arith::CmpIPredicate::eq, size_value, minus_one); + tensor::ExtractOp::create(rewriter, loc, begin_indices, indices); + auto size_value = + tensor::ExtractOp::create(rewriter, loc, sizes, indices); + Value minus_one = 
arith::IndexCastOp::create( + rewriter, loc, shape_scalar_type, + arith::ConstantIndexOp::create(rewriter, loc, -1)); + auto is_minus_one = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::eq, size_value, minus_one); Value end_value = - rewriter.create(loc, begin_value, size_value); - auto dim_value = rewriter.create( - loc, shape_scalar_type, - rewriter.create(loc, input, i)); - end_value = rewriter.create(loc, is_minus_one, - dim_value, end_value); - auto end_value_casted = rewriter.create( - loc, rewriter.getIndexType(), end_value); + arith::AddIOp::create(rewriter, loc, begin_value, size_value); + auto dim_value = arith::IndexCastOp::create( + rewriter, loc, shape_scalar_type, + tensor::DimOp::create(rewriter, loc, input, i)); + end_value = mlir::arith::SelectOp::create(rewriter, loc, is_minus_one, + dim_value, end_value); + auto end_value_casted = arith::IndexCastOp::create( + rewriter, loc, rewriter.getIndexType(), end_value); end_values.push_back(end_value_casted); - auto begin_value_casted = rewriter.create( - loc, rewriter.getIndexType(), begin_value); + auto begin_value_casted = arith::IndexCastOp::create( + rewriter, loc, rewriter.getIndexType(), begin_value); begin_values.push_back(begin_value_casted); } auto index_ty = rewriter.getIndexType(); - auto start_indices = rewriter.create( - loc, + auto start_indices = tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(begin_values.size())}, index_ty), begin_values); - auto end_indices = rewriter.create( - loc, + auto end_indices = tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(end_values.size())}, index_ty), end_values); - auto stride_indices = rewriter.create( - loc, + auto stride_indices = tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(stride_values.size())}, index_ty), stride_values); - auto d_slice = rewriter.create( - loc, op.getOperation()->getResult(0).getType(), input, start_indices, - end_indices, stride_indices); + auto d_slice = stablehlo::RealDynamicSliceOp::create( + rewriter, loc, op.getOperation()->getResult(0).getType(), input, + start_indices, end_indices, stride_indices); rewriter.replaceOp(op, d_slice.getOperation()->getResults()); return success(); } @@ -3110,15 +3136,15 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc, // TODO(silvasean): Reduce duplication across reified shape calculations and // the static computation of output types needed to create ops. 
- Value lhs_shape = rewriter->create(loc, lhs); - Value rhs_shape = rewriter->create(loc, rhs); + Value lhs_shape = shape::ShapeOfOp::create(*rewriter, loc, lhs); + Value rhs_shape = shape::ShapeOfOp::create(*rewriter, loc, rhs); Value const_neg2 = - rewriter->create(loc, rewriter->getIndexAttr(-2)); + arith::ConstantOp::create(*rewriter, loc, rewriter->getIndexAttr(-2)); auto shape_type = shape::ShapeType::get(rewriter->getContext()); - auto lhs_splitted = rewriter->create( - loc, TypeRange{shape_type, shape_type}, lhs_shape, const_neg2); - auto rhs_splitted = rewriter->create( - loc, TypeRange{shape_type, shape_type}, rhs_shape, const_neg2); + auto lhs_splitted = shape::SplitAtOp::create( + *rewriter, loc, TypeRange{shape_type, shape_type}, lhs_shape, const_neg2); + auto rhs_splitted = shape::SplitAtOp::create( + *rewriter, loc, TypeRange{shape_type, shape_type}, rhs_shape, const_neg2); auto lhs_type = mlir::cast(lhs.getType()); auto rhs_type = mlir::cast(rhs.getType()); // The last two dimensions are the matrix row/col dimensions. Don't broadcast @@ -3127,9 +3153,10 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc, mlir::OpTrait::util::getBroadcastedShape( lhs_type.getShape().drop_back(2), rhs_type.getShape().drop_back(2), result_batch_shape_compile_time_extents); - auto result_batch_shape = rewriter->create( - loc, shape_type, lhs_splitted.getHead(), rhs_splitted.getHead(), - /*error=*/nullptr); + auto result_batch_shape = + shape::BroadcastOp::create(*rewriter, loc, shape_type, + lhs_splitted.getHead(), rhs_splitted.getHead(), + /*error=*/nullptr); // Lambda which handles the broadcasting of one side to the common // leading-batch dimensions. auto broadcast_one_side = [&](Value side, RankedTensorType type, @@ -3139,16 +3166,16 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc, result_shape.append(matrix_dims.begin(), matrix_dims.end()); auto result_type = tensorflow::GetTypeFromTFTensorShape( result_shape, type.getElementType()); - auto shape = rewriter->create( - loc, shape_type, result_batch_shape, tail_shape); - auto shape_tensor = rewriter->create( - loc, + auto shape = shape::ConcatOp::create(*rewriter, loc, shape_type, + result_batch_shape, tail_shape); + auto shape_tensor = shape::ToExtentTensorOp::create( + *rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(result_shape.size())}, rewriter->getIndexType()), shape); - *out_side = rewriter->create(loc, result_type, side, - shape_tensor); + *out_side = TF::BroadcastToOp::create(*rewriter, loc, result_type, side, + shape_tensor); }; broadcast_one_side(lhs, lhs_type, lhs_splitted.getTail(), out_lhs); broadcast_one_side(rhs, rhs_type, rhs_splitted.getTail(), out_rhs); @@ -3177,10 +3204,10 @@ class ConvertBatchMatMulV2Op : public OpRewritePattern { auto rhs_type = mlir::dyn_cast(rhs.getType()); if (!lhs_type || !rhs_type) return failure(); if (mlir::isa(lhs_type.getElementType()) && op.getAdjX()) { - lhs = rewriter.create(op.getLoc(), lhs_type, lhs); + lhs = TF::ConjOp::create(rewriter, op.getLoc(), lhs_type, lhs); } if (mlir::isa(rhs_type.getElementType()) && op.getAdjY()) { - rhs = rewriter.create(op.getLoc(), rhs_type, rhs); + rhs = TF::ConjOp::create(rewriter, op.getLoc(), rhs_type, rhs); } // Broadcast both operands. 
@@ -3288,8 +3315,8 @@ class ConvertSplitOp : public OpRewritePattern { for (int i = 0; i < num_splits; ++i) { begin_indices[dim_index] = i * slice_size; end_indices[dim_index] = (i + 1) * slice_size; - slices.push_back(rewriter.create( - op.getLoc(), slice_type, op.getValue(), + slices.push_back(stablehlo::SliceOp::create( + rewriter, op.getLoc(), slice_type, op.getValue(), GetI64ArrayAttr(begin_indices, &rewriter), GetI64ArrayAttr(end_indices, &rewriter), GetI64ArrayAttr(strides, &rewriter))); @@ -3332,23 +3359,23 @@ class ConvertSplitOpDynamic : public OpRewritePattern { if (dim_index < 0) dim_index += input_rank; Value input_dim_size = - rewriter.create(loc, input, dim_index); + tensor::DimOp::create(rewriter, loc, input, dim_index); // Calculate the dimension size for each slice along the split dimension. int num_splits = op.getNumResults(); - Value num_splits_value = rewriter.create( - loc, rewriter.getIndexAttr(num_splits)); + Value num_splits_value = arith::ConstantOp::create( + rewriter, loc, rewriter.getIndexAttr(num_splits)); Value slice_size = - rewriter.create(loc, input_dim_size, num_splits_value); + arith::DivSIOp::create(rewriter, loc, input_dim_size, num_splits_value); - Value zero = rewriter.create(loc, 0); - Value one = rewriter.create(loc, 1); + Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0); + Value one = arith::ConstantIndexOp::create(rewriter, loc, 1); SmallVector begin_indices(input_rank, zero); SmallVector end_indices; end_indices.reserve(input_rank); SmallVector strides(input_rank, one); for (int i = 0; i < input_rank; ++i) { - end_indices.push_back(rewriter.create(loc, input, i)); + end_indices.push_back(tensor::DimOp::create(rewriter, loc, input, i)); } // All HLO d_slice results used to replace the original tf.Split op. 
@@ -3356,30 +3383,32 @@ class ConvertSplitOpDynamic : public OpRewritePattern { slices.reserve(num_splits); for (int i = 0; i < num_splits; ++i) { - begin_indices[dim_index] = rewriter.create( - loc, slice_size, rewriter.create(loc, i)); - end_indices[dim_index] = rewriter.create( - loc, slice_size, rewriter.create(loc, i + 1)); + begin_indices[dim_index] = arith::MulIOp::create( + rewriter, loc, slice_size, + arith::ConstantIndexOp::create(rewriter, loc, i)); + end_indices[dim_index] = arith::MulIOp::create( + rewriter, loc, slice_size, + arith::ConstantIndexOp::create(rewriter, loc, i + 1)); Type index_ty = rewriter.getIndexType(); - auto begin_value = rewriter.create( - loc, + auto begin_value = tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(begin_indices.size())}, index_ty), begin_indices); - auto end_value = rewriter.create( - loc, + auto end_value = tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(end_indices.size())}, index_ty), end_indices); - auto stride_value = rewriter.create( - loc, + auto stride_value = tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(strides.size())}, index_ty), strides); - slices.push_back(rewriter.create( - loc, op.getOperation()->getResult(i).getType(), input, begin_value, - end_value, stride_value)); + slices.push_back(stablehlo::RealDynamicSliceOp::create( + rewriter, loc, op.getOperation()->getResult(i).getType(), input, + begin_value, end_value, stride_value)); } rewriter.replaceOp(op, slices); @@ -3484,10 +3513,11 @@ class ConvertSplitVOp : public OpRewritePattern { for (int i = 0, end = op.getNumResults(); i < end; ++i) { end_indices[dim_index] = begin_indices[dim_index] + split_sizes[i]; - slices.push_back(rewriter.create( - op.getLoc(), op.getValue(), GetI64ArrayAttr(begin_indices, &rewriter), - GetI64ArrayAttr(end_indices, &rewriter), - GetI64ArrayAttr(strides, &rewriter))); + slices.push_back( + stablehlo::SliceOp::create(rewriter, op.getLoc(), op.getValue(), + GetI64ArrayAttr(begin_indices, &rewriter), + GetI64ArrayAttr(end_indices, &rewriter), + GetI64ArrayAttr(strides, &rewriter))); // Prepare the begin indice for the next slice. begin_indices[dim_index] = end_indices[dim_index]; } @@ -3568,11 +3598,11 @@ class ConvertStridedSliceOp : public OpRewritePattern { Location loc = op.getLoc(); Value input = op.getInput(); if (!dims_to_reverse.empty()) - input = rewriter.create( - loc, input_ty, op.getInput(), + input = stablehlo::ReverseOp::create( + rewriter, loc, input_ty, op.getInput(), GetI64ArrayAttr(dims_to_reverse, &rewriter)); - auto sliced = rewriter.create( - loc, input, GetI64ArrayAttr(hlo_begin_indices, &rewriter), + auto sliced = stablehlo::SliceOp::create( + rewriter, loc, input, GetI64ArrayAttr(hlo_begin_indices, &rewriter), GetI64ArrayAttr(hlo_end_indices, &rewriter), GetI64ArrayAttr(hlo_strides, &rewriter)); @@ -3663,21 +3693,21 @@ class ConvertStridedSliceOp : public OpRewritePattern { continue; } - auto index = rewriter.create( - loc, op.getBegin(), GetI64ArrayAttr({d}, &rewriter), + auto index = stablehlo::SliceOp::create( + rewriter, loc, op.getBegin(), GetI64ArrayAttr({d}, &rewriter), GetI64ArrayAttr({d + 1}, &rewriter), GetI64ArrayAttr({1}, &rewriter)); // Convert index to scalar. auto reshaped_index = - rewriter.create(loc, type, index); + stablehlo::ReshapeOp::create(rewriter, loc, type, index); // If the index is negative, wrap it around with dimension size. 
auto index_negative = - rewriter.create(loc, reshaped_index, zero); + TF::LessOp::create(rewriter, loc, reshaped_index, zero); auto input_val = GetScalarConstOfType(begin_element_ty, loc, input_shape[d], &rewriter); auto wrapped_index = - rewriter.create(loc, input_val, reshaped_index); - auto final_index = rewriter.create( - loc, type, index_negative, wrapped_index, reshaped_index); + TF::AddV2Op::create(rewriter, loc, input_val, reshaped_index); + auto final_index = stablehlo::SelectOp::create( + rewriter, loc, type, index_negative, wrapped_index, reshaped_index); slice_begin_indices.push_back(final_index); slice_sizes.push_back(1); } @@ -3687,8 +3717,9 @@ class ConvertStridedSliceOp : public OpRewritePattern { slice_sizes, op.getType().getElementType()); // This must be an xla DynamicSlice op due to the inputs that aren't // constant. - auto sliced = rewriter.create( - loc, sliced_type, op.getInput(), slice_begin_indices, slice_sizes_attr); + auto sliced = stablehlo::DynamicSliceOp::create( + rewriter, loc, sliced_type, op.getInput(), slice_begin_indices, + slice_sizes_attr); // Reshape slice result so that the shape is updated depending on // 'new_axis_mask' or 'shrink_axis_mask' attributes. @@ -3760,9 +3791,9 @@ class ConvertStridedSliceGradOp Type element_type = mlir::cast(grad.getType()).getElementType(); // Perform reshape to undo any new/shrink axes done by strided slice. - grad = rewriter.create( - op.getLoc(), tensorflow::GetTypeFromTFTensorShape(shape, element_type), - grad); + grad = stablehlo::ReshapeOp::create( + rewriter, op.getLoc(), + tensorflow::GetTypeFromTFTensorShape(shape, element_type), grad); SmallVector padding_low, padding_high, padding_interm; SmallVector dims_to_reverse; @@ -3797,8 +3828,8 @@ class ConvertStridedSliceGradOp } if (!dims_to_reverse.empty()) { - grad = rewriter.create( - op.getLoc(), grad.getType(), grad, + grad = stablehlo::ReverseOp::create( + rewriter, op.getLoc(), grad.getType(), grad, GetI64ArrayAttr(dims_to_reverse, &rewriter)); } @@ -3840,10 +3871,10 @@ class ConvertRangeOp : public OpRewritePattern { return failure(); } - auto iota = rewriter.create( - op.getLoc(), result_type, rewriter.getI64IntegerAttr(0)); - auto scaled = rewriter.create( - op.getLoc(), result_type, iota, op.getDelta(), + auto iota = stablehlo::IotaOp::create(rewriter, op.getLoc(), result_type, + rewriter.getI64IntegerAttr(0)); + auto scaled = chlo::BroadcastMulOp::create( + rewriter, op.getLoc(), result_type, iota, op.getDelta(), hlo::getBroadcastDimensionsAttr(&rewriter, iota, op.getDelta())); rewriter.replaceOpWithNewOp( op, result_type, scaled, op.getStart(), @@ -3893,25 +3924,25 @@ class ConvertDynamicRangeOp : public OpRewritePattern { // // %size = ceil(abs((%limit - %start) / %delta)) auto range = - rewriter.create(op.getLoc(), limit, start); - auto abs = rewriter.create(op.getLoc(), range); + stablehlo::SubtractOp::create(rewriter, op.getLoc(), limit, start); + auto abs = stablehlo::AbsOp::create(rewriter, op.getLoc(), range); // Delta is not necessarily the same type as start and limit. auto abs_cast = - rewriter.create(op.getLoc(), compute_type, abs); - auto delta_cast = - rewriter.create(op.getLoc(), compute_type, delta); + stablehlo::ConvertOp::create(rewriter, op.getLoc(), compute_type, abs); + auto delta_cast = stablehlo::ConvertOp::create(rewriter, op.getLoc(), + compute_type, delta); // Compute the total number of integer steps and convert to the HLO // dimension tensor. 
auto normalized = - rewriter.create(op.getLoc(), abs_cast, delta_cast); - auto ceil = rewriter.create(op.getLoc(), normalized); - auto steps = rewriter.create( - op.getLoc(), + stablehlo::DivOp::create(rewriter, op.getLoc(), abs_cast, delta_cast); + auto ceil = stablehlo::CeilOp::create(rewriter, op.getLoc(), normalized); + auto steps = stablehlo::ConvertOp::create( + rewriter, op.getLoc(), tensorflow::GetTypeFromTFTensorShape({}, rewriter.getI64Type()), ceil); - auto reshape = rewriter.create( - op.getLoc(), + auto reshape = stablehlo::ReshapeOp::create( + rewriter, op.getLoc(), tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getI64Type()), steps); @@ -3920,15 +3951,16 @@ class ConvertDynamicRangeOp : public OpRewritePattern { // %range = %start + %delta * iota(%size) auto out_scalar_type = tensorflow::GetTypeFromTFTensorShape( {}, getElementTypeOrSelf(result_type)); - auto start_out_cast = rewriter.create( - op.getLoc(), out_scalar_type, start); - auto delta_out_cast = rewriter.create( - op.getLoc(), out_scalar_type, delta); - - auto iota = rewriter.create( - op.getLoc(), result_type, reshape, rewriter.getI64IntegerAttr(0)); - auto scaled = rewriter.create( - op.getLoc(), result_type, iota, delta_out_cast, + auto start_out_cast = stablehlo::ConvertOp::create(rewriter, op.getLoc(), + out_scalar_type, start); + auto delta_out_cast = stablehlo::ConvertOp::create(rewriter, op.getLoc(), + out_scalar_type, delta); + + auto iota = stablehlo::DynamicIotaOp::create(rewriter, op.getLoc(), + result_type, reshape, + rewriter.getI64IntegerAttr(0)); + auto scaled = chlo::BroadcastMulOp::create( + rewriter, op.getLoc(), result_type, iota, delta_out_cast, hlo::getBroadcastDimensionsAttr(&rewriter, iota, delta_cast)); rewriter.replaceOpWithNewOp( op, result_type, scaled, start_out_cast, @@ -3979,29 +4011,32 @@ class ConvertLinSpaceOp : public OpRewritePattern { int64_t num = (*num_attr.begin()).getSExtValue(); // Calculate the scaling that needs to be applied to the iota. - auto step_numerator = rewriter.create( - op.getLoc(), op.getStart().getType(), op.getStop(), op.getStart(), + auto step_numerator = chlo::BroadcastSubOp::create( + rewriter, op.getLoc(), op.getStart().getType(), op.getStop(), + op.getStart(), hlo::getBroadcastDimensionsAttr(&rewriter, op.getStop(), op.getStart())); - Value step_denominator = rewriter.create( - op.getLoc(), op.getNum(), result_type.getElementType()); + Value step_denominator = stablehlo::ConvertOp::create( + rewriter, op.getLoc(), op.getNum(), result_type.getElementType()); if (num > 1) { Value one = GetScalarConstOfType(result_type.getElementType(), op.getLoc(), 1, &rewriter); - step_denominator = rewriter.create( - op.getLoc(), step_denominator.getType(), step_denominator, one, + step_denominator = chlo::BroadcastSubOp::create( + rewriter, op.getLoc(), step_denominator.getType(), step_denominator, + one, hlo::getBroadcastDimensionsAttr(&rewriter, step_denominator, one)); } - auto step = rewriter.create( - op.getLoc(), step_numerator.getType(), step_numerator, step_denominator, + auto step = chlo::BroadcastDivOp::create( + rewriter, op.getLoc(), step_numerator.getType(), step_numerator, + step_denominator, hlo::getBroadcastDimensionsAttr(&rewriter, step_numerator, step_denominator)); // Scale the iota and add the offset. 
- auto iota = rewriter.create( - op.getLoc(), result_type, rewriter.getI64IntegerAttr(0)); - auto scaled = rewriter.create( - op.getLoc(), result_type, iota, step, + auto iota = stablehlo::IotaOp::create(rewriter, op.getLoc(), result_type, + rewriter.getI64IntegerAttr(0)); + auto scaled = chlo::BroadcastMulOp::create( + rewriter, op.getLoc(), result_type, iota, step, hlo::getBroadcastDimensionsAttr(&rewriter, iota, step)); rewriter.replaceOpWithNewOp( op, result_type, scaled, op.getStart(), @@ -4068,14 +4103,14 @@ class GenericConvertReductionOp : public OpRewritePattern { // repeated arithmetic operations. Type reduce_element_type = is_accumulation ? GetAccumulationType(element_type) : element_type; - auto casted_input = rewriter.create( - loc, op.getInput(), reduce_element_type); + auto casted_input = stablehlo::ConvertOp::create( + rewriter, loc, op.getInput(), reduce_element_type); // Each reduction op can have a different initial value. Value init = Derived::GetInitialValue(reduce_element_type, loc, &rewriter); - auto reduction = rewriter.create( - loc, casted_input.getResult(), init, + auto reduction = stablehlo::ReduceOp::create( + rewriter, loc, casted_input.getResult(), init, GetI64ArrayAttr(xla_dimensions, &rewriter), reduce_element_type); BuildReduceBody(reduce_element_type, &reduction.getBody(), &rewriter); @@ -4083,32 +4118,34 @@ class GenericConvertReductionOp : public OpRewritePattern { // The mean op needs to divide by the product of the reduced dimensions. if (std::is_same::value) { - Value in_shape = rewriter.create(loc, op.getInput()); - Value divisor_count = rewriter.create(loc, 1); + Value in_shape = shape::ShapeOfOp::create(rewriter, loc, op.getInput()); + Value divisor_count = arith::ConstantIndexOp::create(rewriter, loc, 1); for (size_t i = 0; i < input_shape.size(); ++i) { if (reduced_dimensions_bitmap[i]) { - Value index = rewriter.create(loc, i); - auto dim = rewriter.create(loc, in_shape, index); + Value index = arith::ConstantIndexOp::create(rewriter, loc, i); + auto dim = tensor::ExtractOp::create(rewriter, loc, in_shape, index); divisor_count = - rewriter.create(loc, divisor_count, dim); + arith::MulIOp::create(rewriter, loc, divisor_count, dim); } } // HLO ops are only defined on tensors, so we cast the divisor from // index -> i64 -> tensor<1xi64> -> tensor -> tensor - Value divisor_casted = rewriter.create( - loc, rewriter.getI64Type(), divisor_count); - Value divisor_tensor = rewriter.create( - loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getI64Type()), + Value divisor_casted = arith::IndexCastOp::create( + rewriter, loc, rewriter.getI64Type(), divisor_count); + Value divisor_tensor = tensor::FromElementsOp::create( + rewriter, loc, + tensorflow::GetTypeFromTFTensorShape({}, rewriter.getI64Type()), divisor_casted); - Value divisor = rewriter.create( - loc, tensorflow::GetTypeFromTFTensorShape({}, reduce_element_type), + Value divisor = stablehlo::ConvertOp::create( + rewriter, loc, + tensorflow::GetTypeFromTFTensorShape({}, reduce_element_type), divisor_tensor); auto broadcast_dims = rewriter.getDenseI64ArrayAttr({}); - result = rewriter.create(loc, result, divisor, - broadcast_dims); + result = chlo::BroadcastDivOp::create(rewriter, loc, result, divisor, + broadcast_dims); } - result = rewriter.create(loc, result, element_type); + result = stablehlo::ConvertOp::create(rewriter, loc, result, element_type); // Need to reshape back after the reduction if we're keeping the reduced // dimensions. 
Note that we do this through successive (nominally 1) @@ -4122,8 +4159,8 @@ class GenericConvertReductionOp : public OpRewritePattern { if (dim_is_reduced.value()) { auto index_attr = GetI32ElementsAttr( {static_cast(dim_is_reduced.index())}, &rewriter); - Value index = rewriter.create(loc, index_attr); - result = rewriter.create(loc, result, index); + Value index = arith::ConstantOp::create(rewriter, loc, index_attr); + result = TF::ExpandDimsOp::create(rewriter, loc, result, index); } } } @@ -4300,15 +4337,15 @@ class ConvertArgMinMaxOp : public OpRewritePattern { IntegerAttr iota_dimension = IntegerAttr::get(rewriter.getIntegerType(64), axis); - Value input_shape = rewriter.create(loc, op.getInput()); - Value index_values = rewriter.create( - loc, index_type, input_shape, iota_dimension); + Value input_shape = shape::ShapeOfOp::create(rewriter, loc, op.getInput()); + Value index_values = stablehlo::DynamicIotaOp::create( + rewriter, loc, index_type, input_shape, iota_dimension); Value operands[] = {op.getInput(), index_values}; Value init_values[] = {init_value, index_init_value}; - auto reduction = rewriter.create( - loc, llvm::ArrayRef(operands), + auto reduction = stablehlo::ReduceOp::create( + rewriter, loc, llvm::ArrayRef(operands), llvm::ArrayRef(init_values), GetI64ArrayAttr({axis}, &rewriter), TypeRange({input_element_type, index_element_type})); auto direction = Derived::GetDirection(); @@ -4426,14 +4463,14 @@ class ConvertTensorScatterOp : public OpRewritePattern { auto const_attr = GetI64ElementsAttr(expected_update_shape, &rewriter); auto const_op = - rewriter.create(op->getLoc(), const_type, const_attr); + TF::ConstOp::create(rewriter, op->getLoc(), const_type, const_attr); auto broadcast_to_type = tensorflow::GetTypeFromTFTensorShape( llvm::ArrayRef(expected_update_shape), updates_ty.getElementType()); - updates = rewriter.create( - op->getLoc(), broadcast_to_type, op.getUpdates(), const_op); + updates = TF::BroadcastToOp::create( + rewriter, op->getLoc(), broadcast_to_type, op.getUpdates(), const_op); updates_ty = mlir::dyn_cast(updates.getType()); } @@ -4455,9 +4492,9 @@ class ConvertTensorScatterOp : public OpRewritePattern { indices_rank - 1); Location loc = op.getLoc(); - auto scatter = rewriter.create( - loc, op.getType(), ValueRange(Value(op.getTensor())), op.getIndices(), - updates, dims_attr); + auto scatter = stablehlo::ScatterOp::create( + rewriter, loc, op.getType(), ValueRange(Value(op.getTensor())), + op.getIndices(), updates, dims_attr); Derived::BuildScatterBody(tensor_ty.getElementType(), &scatter.getUpdateComputation(), loc, rewriter); @@ -4479,7 +4516,7 @@ class ConvertTensorScatterUpdateOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - builder.create(loc, block->getArgument(1)); + stablehlo::ReturnOp::create(builder, loc, block->getArgument(1)); } }; @@ -4496,9 +4533,9 @@ class ConvertTensorScatterAddOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - auto add_op = builder.create(loc, block->getArgument(0), - block->getArgument(1)); - builder.create(loc, add_op.getResult()); + auto add_op = stablehlo::AddOp::create(builder, loc, block->getArgument(0), + block->getArgument(1)); + stablehlo::ReturnOp::create(builder, loc, add_op.getResult()); } }; @@ -4515,9 +4552,9 @@ class ConvertTensorScatterSubOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); 
block->addArguments({type, type}, SmallVector(2, loc)); - auto sub_op = builder.create( - loc, block->getArgument(0), block->getArgument(1)); - builder.create(loc, sub_op.getResult()); + auto sub_op = stablehlo::SubtractOp::create( + builder, loc, block->getArgument(0), block->getArgument(1)); + stablehlo::ReturnOp::create(builder, loc, sub_op.getResult()); } }; @@ -4534,9 +4571,9 @@ class ConvertTensorScatterMinOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - auto min_op = builder.create(loc, block->getArgument(0), - block->getArgument(1)); - builder.create(loc, min_op.getResult()); + auto min_op = stablehlo::MinOp::create(builder, loc, block->getArgument(0), + block->getArgument(1)); + stablehlo::ReturnOp::create(builder, loc, min_op.getResult()); } }; @@ -4553,9 +4590,9 @@ class ConvertTensorScatterMaxOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - auto max_op = builder.create(loc, block->getArgument(0), - block->getArgument(1)); - builder.create(loc, max_op.getResult()); + auto max_op = stablehlo::MaxOp::create(builder, loc, block->getArgument(0), + block->getArgument(1)); + stablehlo::ReturnOp::create(builder, loc, max_op.getResult()); } }; @@ -4670,10 +4707,10 @@ class ConvertTileOpDynamic : public OpRewritePattern { auto dim_size = input_ty.getDimSize(i); if (dim_size == ShapedType::kDynamic) { input_shape_values.push_back( - rewriter.create(loc, input, i)); + tensor::DimOp::create(rewriter, loc, input, i)); } else { - input_shape_values.push_back(rewriter.create( - loc, rewriter.getIndexAttr(dim_size))); + input_shape_values.push_back(arith::ConstantOp::create( + rewriter, loc, rewriter.getIndexAttr(dim_size))); } } @@ -4691,12 +4728,12 @@ class ConvertTileOpDynamic : public OpRewritePattern { SmallVector out_dim_size; out_dim_size.reserve(input_rank * 2); for (int64_t dim_idx = 0; dim_idx < input_rank; ++dim_idx) { - Value index = rewriter.create( - loc, rewriter.getIndexAttr(dim_idx)); - Value multiples_size = - rewriter.create(loc, multiples, ValueRange{index}); + Value index = arith::ConstantOp::create(rewriter, loc, + rewriter.getIndexAttr(dim_idx)); + Value multiples_size = tensor::ExtractOp::create(rewriter, loc, multiples, + ValueRange{index}); Value multiples_size_casted = - rewriter.create(loc, index_ty, multiples_size); + arith::IndexCastOp::create(rewriter, loc, index_ty, multiples_size); out_dim_size.push_back(multiples_size_casted); out_dim_size.push_back(input_shape_values[dim_idx]); } @@ -4707,8 +4744,8 @@ class ConvertTileOpDynamic : public OpRewritePattern { } auto broadcast_dims_attr = GetI64ArrayAttr(broadcast_dimensions, &rewriter); - Value out_dim_size_tensor = rewriter.create( - loc, + Value out_dim_size_tensor = tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(out_dim_size.size())}, index_ty), out_dim_size); @@ -4716,19 +4753,21 @@ class ConvertTileOpDynamic : public OpRewritePattern { ShapedType::kDynamic); RankedTensorType broadcast_type = tensorflow::GetTypeFromTFTensorShape(broadcast_shape, element_type); - Value broadcast = rewriter.create( - loc, broadcast_type, input, out_dim_size_tensor, broadcast_dims_attr); + Value broadcast = stablehlo::DynamicBroadcastInDimOp::create( + rewriter, loc, broadcast_type, input, out_dim_size_tensor, + broadcast_dims_attr); // %shape = [MS1, MS2] SmallVector shape_values; 
shape_values.reserve(input_rank); for (int64_t i = 0; i < input_rank; ++i) { - Value dim_size_value = rewriter.create( - loc, out_dim_size[2 * i], out_dim_size[2 * i + 1]); + Value dim_size_value = mlir::arith::MulIOp::create( + rewriter, loc, out_dim_size[2 * i], out_dim_size[2 * i + 1]); shape_values.push_back(dim_size_value); } - Value shape = rewriter.create( - loc, tensorflow::GetTypeFromTFTensorShape({input_rank}, index_ty), + Value shape = tensor::FromElementsOp::create( + rewriter, loc, + tensorflow::GetTypeFromTFTensorShape({input_rank}, index_ty), shape_values); rewriter.replaceOpWithNewOp(op, op.getType(), broadcast, shape); @@ -4758,8 +4797,8 @@ class ConvertMaxPoolGradOp : public OpRewritePattern { input_ty.getShape(), op.getKsize(), op.getStrides(), op.getPadding(), &rewriter); - auto result = rewriter.create( - loc, op.getType(), op.getOrigInput(), op.getGrad(), + auto result = stablehlo::SelectAndScatterOp::create( + rewriter, loc, op.getType(), op.getOrigInput(), op.getGrad(), GetScalarConstOfType(element_type, loc, 0, &rewriter), ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter), ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter), @@ -4776,10 +4815,10 @@ class ConvertMaxPoolGradOp : public OpRewritePattern { tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - auto reducer = rewriter.create( - loc, block->getArgument(0), block->getArgument(1), + auto reducer = stablehlo::CompareOp::create( + rewriter, loc, block->getArgument(0), block->getArgument(1), stablehlo::ComparisonDirection::GE); - rewriter.create(loc, reducer.getResult()); + stablehlo::ReturnOp::create(rewriter, loc, reducer.getResult()); } rewriter.replaceOp(op, result); @@ -4955,7 +4994,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { Type filter_element_ty = filter_ty.getElementType(); auto ty = tensorflow::GetTypeFromTFTensorShape(new_shape, filter_element_ty); - filter = rewriter.create(op.getLoc(), ty, filter); + filter = stablehlo::ReshapeOp::create(rewriter, op.getLoc(), ty, filter); // 2. Transpose to [H, W, ..., G, filter_in_depth, out_depth / G]. llvm::SmallVector perm(num_dims + 1); @@ -4963,15 +5002,15 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { std::swap(perm[num_spatial_dims], perm[num_spatial_dims + 1]); std::swap(new_shape[num_spatial_dims], new_shape[num_spatial_dims + 1]); ty = tensorflow::GetTypeFromTFTensorShape(new_shape, filter_element_ty); - filter = rewriter.create( - op.getLoc(), ty, filter, GetI64ArrayAttr(perm, &rewriter)); + filter = stablehlo::TransposeOp::create(rewriter, op.getLoc(), ty, filter, + GetI64ArrayAttr(perm, &rewriter)); // 3. Reshape to [H, W, ..., in_depth, out_depth / G]. new_shape[num_spatial_dims] *= new_shape[num_spatial_dims + 1]; new_shape[num_spatial_dims + 1] = new_shape.back(); new_shape.pop_back(); ty = tensorflow::GetTypeFromTFTensorShape(new_shape, filter_element_ty); - filter = rewriter.create(op.getLoc(), ty, filter); + filter = stablehlo::ReshapeOp::create(rewriter, op.getLoc(), ty, filter); } SmallVector kernel_spatial_dims; @@ -4979,13 +5018,14 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { std::iota(kernel_spatial_dims.begin(), kernel_spatial_dims.end(), 0); // Mirror the filter in the spatial dimensions. 
- filter = rewriter.create( - op.getLoc(), filter, GetI64ArrayAttr(kernel_spatial_dims, &rewriter)); + filter = stablehlo::ReverseOp::create( + rewriter, op.getLoc(), filter, + GetI64ArrayAttr(kernel_spatial_dims, &rewriter)); // activation gradients // = gradients (with padding and dilation) mirrored_weights - Value result = rewriter.create( - op.getLoc(), op.getType(), op.getOutBackprop(), filter, + Value result = stablehlo::ConvolutionOp::create( + rewriter, op.getLoc(), op.getType(), op.getOutBackprop(), filter, /*window_strides=*/ GetI64ArrayAttrForValue(/*size=*/num_spatial_dims, /*val=*/1, &rewriter), @@ -5191,8 +5231,8 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern { const int batch_dim = tensorflow::GetTensorBatchDimIndex(num_dims, data_format); - Value result = rewriter.create( - op.getLoc(), op.getType(), op.getInput(), op.getOutBackprop(), + Value result = stablehlo::ConvolutionOp::create( + rewriter, op.getLoc(), op.getType(), op.getInput(), op.getOutBackprop(), /*window_strides=*/GetI64ArrayAttr(window_strides, &rewriter), /*padding=*/paddings_attr, /*lhs_dilation=*/ GetI64ArrayAttrForValue(/*size=*/num_spatial_dims, /*val=*/1, @@ -5331,14 +5371,15 @@ class ConvertInfeedDequeueTupleOp // Infeed takes a single token operand. Generate the token using // create_token op to pass to the infeed op. - auto token = rewriter.create( - op.getLoc(), stablehlo::TokenType::get(rewriter.getContext())); + auto token = stablehlo::CreateTokenOp::create( + rewriter, op.getLoc(), + stablehlo::TokenType::get(rewriter.getContext())); result_types.push_back(token.getType()); ArrayAttr layout; // filled in during the xla-adjust-layout pass - auto data_and_token = rewriter.create( - op.getLoc(), result_types, token, + auto data_and_token = stablehlo::InfeedOp::create( + rewriter, op.getLoc(), result_types, token, /*infeed_config=*/rewriter.getStringAttr(""), /*layout=*/layout); @@ -5409,11 +5450,11 @@ class ConvertOutfeedEnqueueTupleOp PatternRewriter &rewriter) const override { auto token_type = stablehlo::TokenType::get(rewriter.getContext()); auto token = - rewriter.create(op.getLoc(), token_type); + stablehlo::CreateTokenOp::create(rewriter, op.getLoc(), token_type); - rewriter.create( - op.getLoc(), token_type, op.getInputs(), token, - /*outfeed_config=*/rewriter.getStringAttr("")); + stablehlo::OutfeedOp::create(rewriter, op.getLoc(), token_type, + op.getInputs(), token, + /*outfeed_config=*/rewriter.getStringAttr("")); rewriter.eraseOp(op); return success(); } @@ -5475,14 +5516,15 @@ class ConvertUnpackOp : public OpRewritePattern { begin_indices[axis] = i; end_indices[axis] = i + 1; - auto slice_op = rewriter.create( - op.getLoc(), op.getValue(), GetI64ArrayAttr(begin_indices, &rewriter), - GetI64ArrayAttr(end_indices, &rewriter), - GetI64ArrayAttr(strides, &rewriter)); + auto slice_op = + stablehlo::SliceOp::create(rewriter, op.getLoc(), op.getValue(), + GetI64ArrayAttr(begin_indices, &rewriter), + GetI64ArrayAttr(end_indices, &rewriter), + GetI64ArrayAttr(strides, &rewriter)); // Reshape to drop the axis dimension. 
- auto result = rewriter.create( - op.getLoc(), op.getType(i), slice_op, - rewriter.getI64ArrayAttr(op.getAxis())); + auto result = + TF::SqueezeOp::create(rewriter, op.getLoc(), op.getType(i), slice_op, + rewriter.getI64ArrayAttr(op.getAxis())); results.push_back(result); } @@ -5525,16 +5567,16 @@ class ConvertUnpackOpDynamic : public OpRewritePattern { for (int64_t dim_idx = 0; dim_idx < value_rank; ++dim_idx) { int64_t dim_size = value_type.getDimSize(dim_idx); if (dim_size == ShapedType::kDynamic) { - Value dim_i = rewriter.create( - loc, shape_scalar_type, - rewriter.create(loc, op.getOperand(), dim_idx)); + Value dim_i = arith::IndexCastOp::create( + rewriter, loc, shape_scalar_type, + tensor::DimOp::create(rewriter, loc, op.getOperand(), dim_idx)); end_indices.push_back(dim_i); if (dim_idx != axis) { shape_values.push_back(dim_i); } } else { - Value dim_i = rewriter.create( - loc, shape_scalar_type, + Value dim_i = arith::ConstantOp::create( + rewriter, loc, shape_scalar_type, rewriter.getIntegerAttr(shape_scalar_type, dim_size)); end_indices.push_back(dim_i); if (dim_idx != axis) { @@ -5545,44 +5587,45 @@ class ConvertUnpackOpDynamic : public OpRewritePattern { } } begin_indices.push_back( - rewriter.create(loc, 0, 32)); - strides.push_back(rewriter.create(loc, 1, 32)); + arith::ConstantIntOp::create(rewriter, loc, 0, 32)); + strides.push_back(arith::ConstantIntOp::create(rewriter, loc, 1, 32)); } SmallVector results; results.reserve(op.getNumResults()); Type i32_ty = rewriter.getI32Type(); for (int64_t i = 0; i < op.getNumResults(); ++i) { - begin_indices[axis] = rewriter.create(loc, i, 32); - end_indices[axis] = rewriter.create(loc, i + 1, 32); - Value slice_op = rewriter.create( - loc, + begin_indices[axis] = arith::ConstantIntOp::create(rewriter, loc, i, 32); + end_indices[axis] = + arith::ConstantIntOp::create(rewriter, loc, i + 1, 32); + Value slice_op = stablehlo::RealDynamicSliceOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape(slice_shape, value_type.getElementType()), op.getValue(), - rewriter.create( - loc, + tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(begin_indices.size())}, i32_ty), begin_indices), - rewriter.create( - loc, + tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(end_indices.size())}, i32_ty), end_indices), - rewriter.create( - loc, + tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(strides.size())}, i32_ty), strides)); // Reshape to drop the axis dimension. 
- Value new_shape = rewriter.create( - loc, + Value new_shape = tensor::FromElementsOp::create( + rewriter, loc, tensorflow::GetTypeFromTFTensorShape( {static_cast(shape_values.size())}, i32_ty), shape_values); - Value reshape_op = rewriter.create( - loc, op.getType(i), slice_op, new_shape); + Value reshape_op = stablehlo::DynamicReshapeOp::create( + rewriter, loc, op.getType(i), slice_op, new_shape); results.push_back(reshape_op); } @@ -5619,16 +5662,20 @@ class ConvertSigmoidGradOpDynamic : public OpRewritePattern { assert(mlir::isa(elem_tp)); attr = rewriter.getFloatAttr(elem_tp, 1); } - Value one = rewriter.create( - loc, DenseElementsAttr::get( - tensorflow::GetTypeFromTFTensorShape({}, elem_tp), attr)); - - auto v0 = rewriter.create( - loc, dy, y, hlo::getBroadcastDimensionsAttr(&rewriter, dy, y)); - auto v1 = rewriter.create( - loc, one, y, hlo::getBroadcastDimensionsAttr(&rewriter, one, y)); - auto result = rewriter.create( - loc, v0, v1, hlo::getBroadcastDimensionsAttr(&rewriter, v0, v1)); + Value one = stablehlo::ConstantOp::create( + rewriter, loc, + DenseElementsAttr::get( + tensorflow::GetTypeFromTFTensorShape({}, elem_tp), attr)); + + auto v0 = chlo::BroadcastMulOp::create( + rewriter, loc, dy, y, + hlo::getBroadcastDimensionsAttr(&rewriter, dy, y)); + auto v1 = chlo::BroadcastSubOp::create( + rewriter, loc, one, y, + hlo::getBroadcastDimensionsAttr(&rewriter, one, y)); + auto result = chlo::BroadcastMulOp::create( + rewriter, loc, v0, v1, + hlo::getBroadcastDimensionsAttr(&rewriter, v0, v1)); rewriter.replaceOp(op, result.getOperation()->getResults()); return success(); @@ -5684,8 +5731,8 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern { // 'operand' parameter to scatter to for the final scatter op. Value init = ConcreteClass::GetInitialValue(data_type.getElementType(), op.getLoc(), &rewriter); - auto broadcasted_init = rewriter.create( - op.getLoc(), output_type, init, + auto broadcasted_init = stablehlo::BroadcastOp::create( + rewriter, op.getLoc(), output_type, init, GetI64ArrayAttr(output_shape, &rewriter)); // Parameters for the generated scatter op. @@ -5702,9 +5749,10 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern { /*scatterIndicesBatchingDims=*/{}, scatter_dims_to_operand_dims, index_vector_dim); - auto scatter = rewriter.create( - op.getLoc(), op.getType(), ValueRange(Value(broadcasted_init)), - op.getSegmentIds(), op.getData(), dims_attr); + auto scatter = stablehlo::ScatterOp::create( + rewriter, op.getLoc(), op.getType(), + ValueRange(Value(broadcasted_init)), op.getSegmentIds(), op.getData(), + dims_attr); BuildReduceBody(data_type.getElementType(), &scatter.getUpdateComputation(), &rewriter); @@ -5868,8 +5916,8 @@ class ConvertRandomShuffleOp : public OpRewritePattern { // Generate range(n) as the initial value for the indices to be swapped. auto indices_type = tensorflow::GetTypeFromTFTensorShape( {first_dim_size}, rewriter.getIntegerType(32)); - Value indices = rewriter.create( - op.getLoc(), indices_type, rewriter.getI64IntegerAttr(0)); + Value indices = stablehlo::IotaOp::create( + rewriter, op.getLoc(), indices_type, rewriter.getI64IntegerAttr(0)); // Generate random numbers to be used as swaps for the indices. Value swaps = CreateRngUniform32(op.getLoc(), first_dim_size, 0, @@ -5889,22 +5937,23 @@ class ConvertRandomShuffleOp : public OpRewritePattern { // We need to swap the indices[i] with indices[swaps[i]]. First get // these index values. 
- Value source_index = builder->create( - loc, indices, i, scalar_one); - Value swap_index = builder->create( - loc, scalar_i32_type, - builder->create(loc, swaps, i, - scalar_one)); - Value target_index = builder->create( - loc, indices, swap_index, scalar_one); + Value source_index = stablehlo::DynamicSliceOp::create( + *builder, loc, indices, i, scalar_one); + Value swap_index = stablehlo::ReshapeOp::create( + *builder, loc, scalar_i32_type, + stablehlo::DynamicSliceOp::create(*builder, loc, swaps, i, + scalar_one)); + Value target_index = stablehlo::DynamicSliceOp::create( + *builder, loc, indices, swap_index, scalar_one); // Then perform the swap. // indices[i] <- indices[swaps[i]] - indices = builder->create( - loc, indices.getType(), indices, target_index, llvm::ArrayRef(i)); + indices = stablehlo::DynamicUpdateSliceOp::create( + *builder, loc, indices.getType(), indices, target_index, + llvm::ArrayRef(i)); // indices[swaps[i]] <- indices[i] - indices = builder->create( - loc, indices.getType(), indices, source_index, + indices = stablehlo::DynamicUpdateSliceOp::create( + *builder, loc, indices.getType(), indices, source_index, llvm::ArrayRef(swap_index)); // Update new values. @@ -5932,25 +5981,27 @@ class ConvertRandomShuffleOp : public OpRewritePattern { SmallVector slice_sizes_values; for (auto i = 0; i < slice_sizes.size(); ++i) { if (slice_sizes[i] == tensorflow::kTFDynamicSize) { - Value i_const = rewriter.create( - op.getLoc(), rewriter.getIndexAttr(i)); + Value i_const = arith::ConstantOp::create(rewriter, op.getLoc(), + rewriter.getIndexAttr(i)); Value slice_size_index = - rewriter.create(op.getLoc(), op.getValue(), i_const); - Value index_to_i64 = rewriter.create( - op.getLoc(), rewriter.getI64Type(), slice_size_index); - Value i64_to_tensor = rewriter.create( - op.getLoc(), + shape::DimOp::create(rewriter, op.getLoc(), op.getValue(), i_const); + Value index_to_i64 = arith::IndexCastOp::create( + rewriter, op.getLoc(), rewriter.getI64Type(), slice_size_index); + Value i64_to_tensor = tensor::FromElementsOp::create( + rewriter, op.getLoc(), tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getI64Type()), index_to_i64); slice_sizes_values.push_back(i64_to_tensor); } else { - slice_sizes_values.push_back(rewriter.create( - op.getLoc(), GetI64ElementsAttr({slice_sizes[i]}, &rewriter))); + slice_sizes_values.push_back(stablehlo::ConstantOp::create( + rewriter, op.getLoc(), + GetI64ElementsAttr({slice_sizes[i]}, &rewriter))); } } - auto slice_sizes_concat = rewriter.create( - op.getLoc(), slice_sizes_values, rewriter.getI64IntegerAttr(0)); + auto slice_sizes_concat = stablehlo::ConcatenateOp::create( + rewriter, op.getLoc(), slice_sizes_values, + rewriter.getI64IntegerAttr(0)); rewriter.replaceOpWithNewOp( op, op.getType(), op.getValue(), swaped_indices, slice_sizes_concat, dims_attr); @@ -5981,8 +6032,8 @@ class ConvertXlaShardingOp : public OpRewritePattern { NamedAttribute call_target_name = rewriter.getNamedAttr( "call_target_name", rewriter.getStringAttr("Sharding")); - auto custom_call = rewriter.create( - op.getLoc(), op.getType(), op.getInput(), + auto custom_call = stablehlo::CustomCallOp::create( + rewriter, op.getLoc(), op.getType(), op.getInput(), ArrayRef{call_target_name}); custom_call->setAttr(kShardingAttr, *sharding); rewriter.replaceOp(op, custom_call.getResult(0)); @@ -6023,8 +6074,8 @@ class ConvertInplaceUpdateOp : public OpRewritePattern { // subsequent ones are constructed based on zero_attr. Thus the type // for zero_attr needs to be i32 as well. 
auto zero_attr = IntegerAttr::get(rewriter.getIntegerType(32), 0); - auto unpacked_indices = rewriter.create( - op.getLoc(), unpacked_indices_type, indices, zero_attr); + auto unpacked_indices = TF::UnpackOp::create( + rewriter, op.getLoc(), unpacked_indices_type, indices, zero_attr); SmallVector split_updates_shape; split_updates_shape.append(updates_type.getShape().begin(), @@ -6036,10 +6087,10 @@ class ConvertInplaceUpdateOp : public OpRewritePattern { tensorflow::GetTypeFromTFTensorShape(split_updates_shape, updates_type.getElementType())); - auto cst = rewriter.create(op.getLoc(), zero_attr) + auto cst = stablehlo::ConstantOp::create(rewriter, op.getLoc(), zero_attr) .getResult(); - auto split_updates = rewriter.create( - op.getLoc(), split_updates_type, cst, updates); + auto split_updates = TF::SplitOp::create(rewriter, op.getLoc(), + split_updates_type, cst, updates); SmallVector input_indices; input_indices.resize(input_type.getRank(), cst); @@ -6047,8 +6098,9 @@ class ConvertInplaceUpdateOp : public OpRewritePattern { for (auto pair : llvm::zip(unpacked_indices.getOutput(), split_updates.getOutput())) { input_indices.front() = std::get<0>(pair); - input = rewriter.create( - op.getLoc(), op.getType(), input, std::get<1>(pair), input_indices); + input = stablehlo::DynamicUpdateSliceOp::create( + rewriter, op.getLoc(), op.getType(), input, std::get<1>(pair), + input_indices); } rewriter.replaceOp(op, input); @@ -6073,8 +6125,8 @@ class ConvertXlaDynamicUpdateSliceOp SmallVector unpacked_indices_type( indices_type.getDimSize(0), tensorflow::GetTypeFromTFTensorShape( {}, indices_type.getElementType())); - auto unpacked_indices = rewriter.create( - op.getLoc(), unpacked_indices_type, op.getIndices(), + auto unpacked_indices = TF::UnpackOp::create( + rewriter, op.getLoc(), unpacked_indices_type, op.getIndices(), IntegerAttr::get(rewriter.getIntegerType(64), 0)); rewriter.replaceOpWithNewOp( op, op.getType(), op.getInput(), op.getUpdate(), @@ -6106,8 +6158,8 @@ class ConvertXlaReduceScatterOp Location loc = op.getLoc(); Type element_type = getElementTypeOrSelf(op.getInput().getType()); - auto reduce_scatter = rewriter.create( - loc, op.getType(), op.getInput(), + auto reduce_scatter = stablehlo::ReduceScatterOp::create( + rewriter, loc, op.getType(), op.getInput(), rewriter.getIntegerAttr(rewriter.getIntegerType(64), scatter_dimension.getSExtValue()), replica_groups, stablehlo::ChannelHandleAttr()); @@ -6140,8 +6192,8 @@ class ConvertXlaReduceScatterOp auto divisor = GetScalarConstOfType(element_type, loc, replica_group_size, &rewriter); auto broadcast_dims = rewriter.getDenseI64ArrayAttr({}); - result = rewriter.create( - loc, result, divisor.getResult(), broadcast_dims); + result = chlo::BroadcastDivOp::create( + rewriter, loc, result, divisor.getResult(), broadcast_dims); } rewriter.replaceOp(op, {result}); @@ -6171,8 +6223,8 @@ class ConvertXlaReduceWindowOp SmallVector result_types{op.getResult().getType()}; // Create the stablehlo.SelectAndScatter op. 
- auto reduce_window_op = rewriter.create( - loc, result_types, op.getInput(), op.getInitValue(), + auto reduce_window_op = stablehlo::ReduceWindowOp::create( + rewriter, loc, result_types, op.getInput(), op.getInitValue(), ToDenseI64ArrayAttr(window_dimensions, &rewriter), ToDenseI64ArrayAttr(window_strides, &rewriter), ToDenseI64ArrayAttr(base_dilations, &rewriter), @@ -6213,20 +6265,20 @@ class ConvertClipByValueOp : public OpRewritePattern { return failure(); } - auto shape = rewriter.create( - op.getLoc(), - tensorflow::GetTypeFromTFTensorShape({input_ty.getRank()}, - rewriter.getI32Type()), - input); + auto shape = + TF::ShapeOp::create(rewriter, op.getLoc(), + tensorflow::GetTypeFromTFTensorShape( + {input_ty.getRank()}, rewriter.getI32Type()), + input); if (min_ty != input_ty) { - min = - rewriter.create(op.getLoc(), input_ty, min, shape); + min = TF::BroadcastToOp::create(rewriter, op.getLoc(), input_ty, min, + shape); } if (max_ty != input_ty) { - max = - rewriter.create(op.getLoc(), input_ty, max, shape); + max = TF::BroadcastToOp::create(rewriter, op.getLoc(), input_ty, max, + shape); } rewriter.replaceOpWithNewOp(op, input_ty, min, input, @@ -6250,9 +6302,9 @@ class ConvertConstOp : public OpRewritePattern { return failure(); Location loc = op.getLoc(); - Value result = rewriter.create(loc, op.getValue()); + Value result = stablehlo::ConstantOp::create(rewriter, loc, op.getValue()); if (result.getType() != op.getType()) - result = rewriter.create(loc, op.getType(), result); + result = tensor::CastOp::create(rewriter, loc, op.getType(), result); rewriter.replaceOp(op, result); return success(); } @@ -6298,8 +6350,9 @@ class ConvertCumOp : public OpRewritePattern { // the input and then later reverse the output. if (op.getReverse()) { llvm::SmallVector dims_to_reverse({axis}); - input = rewriter.create( - op.getLoc(), input, GetI64ArrayAttr(dims_to_reverse, &rewriter)); + input = stablehlo::ReverseOp::create( + rewriter, op.getLoc(), input, + GetI64ArrayAttr(dims_to_reverse, &rewriter)); } // Convert if we need to enlarge the element type's bitwidth to avoid @@ -6313,8 +6366,8 @@ class ConvertCumOp : public OpRewritePattern { } Type sum_element_type = GetSumAccumulationType(input_element_type); - input = rewriter.create(op.getLoc(), input, - sum_element_type); + input = stablehlo::ConvertOp::create(rewriter, op.getLoc(), input, + sum_element_type); SmallVector window_dims(rank, 1); SmallVector window_strides(rank, 1); @@ -6333,8 +6386,8 @@ class ConvertCumOp : public OpRewritePattern { Value init = GetScalarConstOfType(sum_element_type, op.getLoc(), init_value, &rewriter); - auto reduce = rewriter.create( - op.getLoc(), input.getType(), input, init, + auto reduce = stablehlo::ReduceWindowOp::create( + rewriter, op.getLoc(), input.getType(), input, init, GetI64ArrayAttr(window_dims, &rewriter), GetI64ArrayAttr(window_strides, &rewriter), /*base_dilations=*/DenseI64ArrayAttr(), @@ -6353,20 +6406,22 @@ class ConvertCumOp : public OpRewritePattern { llvm::SmallVector interior_padding(rank, 0); low_padding[axis] = 1; high_padding[axis] = -1; - result = rewriter.create( - op.getLoc(), result, init, GetI64ArrayAttr(low_padding, &rewriter), + result = stablehlo::PadOp::create( + rewriter, op.getLoc(), result, init, + GetI64ArrayAttr(low_padding, &rewriter), GetI64ArrayAttr(high_padding, &rewriter), GetI64ArrayAttr(interior_padding, &rewriter)); } // Convert back if we enlarged the element type's bitwidth. 
- result = rewriter.create(op.getLoc(), result, - input_element_type); + result = stablehlo::ConvertOp::create(rewriter, op.getLoc(), result, + input_element_type); if (op.getReverse()) { llvm::SmallVector dims_to_reverse({axis}); - result = rewriter.create( - op.getLoc(), result, GetI64ArrayAttr(dims_to_reverse, &rewriter)); + result = stablehlo::ReverseOp::create( + rewriter, op.getLoc(), result, + GetI64ArrayAttr(dims_to_reverse, &rewriter)); } rewriter.replaceOp(op, result); @@ -6397,7 +6452,7 @@ class ConvertShapeOp : public OpRewritePattern { auto index_tensor = tensorflow::GetTypeFromTFTensorShape( result_ty.getShape(), rewriter.getIndexType()); auto shape_op = - rewriter.create(op.getLoc(), index_tensor, input); + shape::ShapeOfOp::create(rewriter, op.getLoc(), index_tensor, input); rewriter.replaceOpWithNewOp(op, result_ty, shape_op); return success(); } @@ -6422,8 +6477,8 @@ class ConvertDynamicExpandDimsOp : public OpRewritePattern { return failure(); } - auto shape = rewriter.create( - op.getLoc(), + auto shape = shape::ShapeOfOp::create( + rewriter, op.getLoc(), tensorflow::GetTypeFromTFTensorShape({input_ty.getRank()}, rewriter.getIndexType()), input); @@ -6444,17 +6499,18 @@ class ConvertDynamicExpandDimsOp : public OpRewritePattern { } dims[inserted_dim] = - rewriter.create(op.getLoc(), 1); + arith::ConstantIndexOp::create(rewriter, op.getLoc(), 1); for (int i = 0; i < dims.size() - 1; i++) { // Add the extracted dim. - Value index = rewriter.create(op.getLoc(), i); - Value dim = rewriter.create(op.getLoc(), shape, index); + Value index = arith::ConstantIndexOp::create(rewriter, op.getLoc(), i); + Value dim = + tensor::ExtractOp::create(rewriter, op.getLoc(), shape, index); dims[i >= inserted_dim ? i + 1 : i] = dim; } auto from_extents = - rewriter.create(op.getLoc(), dims); + tensor::FromElementsOp::create(rewriter, op.getLoc(), dims); rewriter.replaceOpWithNewOp( op, result_ty, input, from_extents); return success(); @@ -6497,11 +6553,11 @@ class ConvertDynamicSqueezeOp : public OpRewritePattern { llvm::SmallVector dims; for (int64_t i = 0; i != input_rank; ++i) { if (llvm::is_contained(squeeze_dims, i)) continue; - dims.push_back(rewriter.create(op.getLoc(), input, i)); + dims.push_back(tensor::DimOp::create(rewriter, op.getLoc(), input, i)); } auto from_extents = - rewriter.create(op.getLoc(), dims); + tensor::FromElementsOp::create(rewriter, op.getLoc(), dims); rewriter.replaceOpWithNewOp( op, result_ty, input, from_extents); return success(); @@ -6592,9 +6648,9 @@ class ConvertXlaSelectAndScatterOp SmallVector result_types{op.getResult().getType()}; // Create the stablehlo.SelectAndScatter op. 
- auto select_and_scatter_op = rewriter.create( - loc, result_types, op.getOperand(), op.getSource(), op.getInitValue(), - ToDenseI64ArrayAttr(window_dimensions, &rewriter), + auto select_and_scatter_op = stablehlo::SelectAndScatterOp::create( + rewriter, loc, result_types, op.getOperand(), op.getSource(), + op.getInitValue(), ToDenseI64ArrayAttr(window_dimensions, &rewriter), ToDenseI64ArrayAttr(window_strides, &rewriter), mlir::cast( hlo::convertElementsAttr(padding, rewriter.getIntegerType(64)))); @@ -6672,8 +6728,9 @@ class ConvertXlaRngBitGeneratorOp auto algorithm_attr = mlir::stablehlo::RngAlgorithmAttr::get( rewriter.getContext(), *mlir::stablehlo::symbolizeRngAlgorithm(xla_alg.value())); - auto rng_bit_generator_op = rewriter.create( - loc, op.getResultTypes(), algorithm_attr, op.getInitialState()); + auto rng_bit_generator_op = stablehlo::RngBitGeneratorOp::create( + rewriter, loc, op.getResultTypes(), algorithm_attr, + op.getInitialState()); rewriter.replaceOp(op, rng_bit_generator_op.getResults()); @@ -6700,8 +6757,8 @@ class ConvertXlaVariadicReduceV2Op [](Type ty) { return mlir::cast(ty).getElementType(); })}; // Create the stablehlo.reduce op. - auto reduce_op = rewriter.create( - loc, op.getInputs(), op.getInitValues(), + auto reduce_op = stablehlo::ReduceOp::create( + rewriter, loc, op.getInputs(), op.getInitValues(), ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getDimensionsToReduce()), &rewriter), elementTypes); @@ -6727,9 +6784,9 @@ class ConvertXlaVariadicSortOp ElementsAttr dimension; matchPattern(op.getDimension(), m_Constant(&dimension)); // Create the stablehlo.sort op. - auto sort_op = rewriter.create( - loc, op.getInputs(), dimension.getValues()[0].getInt(), - op.getIsStable()); + auto sort_op = stablehlo::SortOp::create( + rewriter, loc, op.getInputs(), + dimension.getValues()[0].getInt(), op.getIsStable()); mlir::SymbolRefAttr func = op.getComparator(); auto func_op = cast(SymbolTable::lookupSymbolIn( op->getParentOfType(), func)); @@ -6816,9 +6873,9 @@ class LowerControlFlowOp : public OpConversionPattern { if constexpr (std::is_same::value) { // Explicitly handle the Case op because it has variadic regions and takes // the number of regions as an input along with the operands. 
-      stablehlo_op = rewriter.create(loc, op.getResultTypes(),
-                                     adaptor.getBranchIndex(),
-                                     op.getBranches().size());
+      stablehlo_op =
+          DstOpT::create(rewriter, loc, op.getResultTypes(),
+                         adaptor.getBranchIndex(), op.getBranches().size());
     } else if constexpr (std::is_same::value) {
       llvm::SmallVector while_result_types;
       while_result_types.reserve(num_results);
@@ -6827,11 +6884,11 @@ class LowerControlFlowOp : public OpConversionPattern {
         while_result_types.push_back(ty);
       }
-      stablehlo_op = rewriter.create(loc, TypeRange(while_result_types),
-                                     adaptor.getOperands());
+      stablehlo_op = DstOpT::create(
+          rewriter, loc, TypeRange(while_result_types), adaptor.getOperands());
     } else {
-      stablehlo_op = rewriter.create(loc, op.getResultTypes(),
-                                     adaptor.getOperands());
+      stablehlo_op = DstOpT::create(rewriter, loc, op.getResultTypes(),
+                                    adaptor.getOperands());
     }
     int64_t num_regions = op.getNumRegions();
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc
index 7061aaa4a5657b..abfcc0d26acc65 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc
@@ -143,8 +143,9 @@ LogicalResult ConvertAllReduce(OpBuilder& builder, int64_t channel_id,
   ChannelHandleAttr channel_handle = ConvertChannel(builder, channel_id, mode);
   Location loc = op->getLoc();
   Type element_type = getElementTypeOrSelf(input.getType());
-  auto all_reduce = builder.create(
-      loc, result_type, input, replica_groups, channel_handle, nullptr);
+  auto all_reduce =
+      AllReduceOp::create(builder, loc, result_type, input, replica_groups,
+                          channel_handle, nullptr);
   if (all_reduce.getNumResults() != 1) {
     return op->emitOpError()
@@ -178,8 +179,8 @@ LogicalResult ConvertAllReduce(OpBuilder& builder, int64_t channel_id,
     auto divisor =
         GetScalarConstOfType(element_type, loc, replica_group_size, &builder);
     auto broadcast_dims = builder.getDenseI64ArrayAttr({});
-    result = builder.create(
-        loc, all_reduce.getResult(0), divisor.getResult(), broadcast_dims);
+    result = chlo::BroadcastDivOp::create(builder, loc, all_reduce.getResult(0),
+                                          divisor.getResult(), broadcast_dims);
   } else if (final_op != "Id") {
     return op->emitOpError()
            << "invalid final_op " << final_op << ", want one of [Id, Div]";
@@ -373,11 +374,12 @@ class ConvertCollectiveAssignGroupV2
     IntegerAttr group_size = rewriter.getI32IntegerAttr(replica_groups.size());
     IntegerAttr group_key = rewriter.getI32IntegerAttr(0);
-    auto const_group_size = rewriter.create(
-        assign_group->getLoc(), assign_group.getResult(0).getType(),
-        group_size);
-    auto const_group_key = rewriter.create(
-        assign_group->getLoc(), assign_group.getResult(1).getType(), group_key);
+    auto const_group_size =
+        TF::ConstOp::create(rewriter, assign_group->getLoc(),
+                            assign_group.getResult(0).getType(), group_size);
+    auto const_group_key =
+        TF::ConstOp::create(rewriter, assign_group->getLoc(),
+                            assign_group.getResult(1).getType(), group_key);
     rewriter.replaceAllUsesWith(assign_group.getResult(0), const_group_size);
     rewriter.replaceAllUsesWith(assign_group.getResult(1), const_group_key);
     rewriter.eraseOp(assign_group);
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc
index 7e653188857283..b1105d1a4e4000 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc
@@ -289,10 +289,10 @@ Value CreateSendOp(OpBuilder& builder, Location loc, Value operand,
       /*handle=*/GetNextChannelId(), /*type=*/2);
   auto empty_source_target_pairs = builder.getI64TensorAttr({});
-  auto send = builder.create(
-      loc, token.getType(), operand, token, channel_handle,
-      /*is_host_transfer=*/builder.getBoolAttr(true),
-      /*source_target_pairs=*/empty_source_target_pairs);
+  auto send = SendOp::create(builder, loc, token.getType(), operand, token,
+                             channel_handle,
+                             /*is_host_transfer=*/builder.getBoolAttr(true),
+                             /*source_target_pairs=*/empty_source_target_pairs);
   SetFrontendAttributes(send, index, key, operand.getType(),
                         /*device_to_host=*/true, host_handler_name);
@@ -311,10 +311,10 @@ Value CreateRecvOp(OpBuilder& builder, Location loc, Value result,
       /*type=*/3);
   auto result_type = result.getType();
   SmallVector recv_result_type = {result_type, token.getType()};
-  auto recv = builder.create(
-      loc, recv_result_type, token, channel_handle,
-      /*is_host_transfer=*/builder.getBoolAttr(true),
-      /*source_target_pairs=*/builder.getI64TensorAttr({}));
+  auto recv =
+      RecvOp::create(builder, loc, recv_result_type, token, channel_handle,
+                     /*is_host_transfer=*/builder.getBoolAttr(true),
+                     /*source_target_pairs=*/builder.getI64TensorAttr({}));
   SetFrontendAttributes(recv, index, key, result_type,
                         /*device_to_host=*/false, host_handler_name);
@@ -336,7 +336,7 @@ Value CreateSinkToken(OpBuilder& builder, Location loc, ArrayRef tokens,
   } else if (llvm::hasSingleElement(tokens)) {
     return tokens[0];
   } else {
-    return builder.create(loc, original_token.getType(), tokens)
+    return AfterAllOp::create(builder, loc, original_token.getType(), tokens)
         .getResult();
   }
 }
@@ -413,8 +413,8 @@ Value RewriteCallOp(OpBuilder& builder, func::CallOp call,
   new_operands.push_back(token);
   auto new_result_types = llvm::to_vector(call.getResultTypes());
   new_result_types.push_back(token.getType());
-  auto new_call = builder.create(
-      call.getLoc(), new_result_types,
+  auto new_call = func::CallOp::create(
+      builder, call.getLoc(), new_result_types,
       new_symbol ? *new_symbol : call.getCallee(), new_operands);
   for (auto results : llvm::zip(call.getResults(), new_call.getResults()))
@@ -435,7 +435,7 @@ struct OpVisitorState {
 // Creates a tuple from a sequence of values.
 Value CreateTuple(OpBuilder& builder, Location loc, ArrayRef operands) {
-  return builder.create(loc, operands).getResult();
+  return TupleOp::create(builder, loc, operands).getResult();
 }
 // Extends `values` with the value `token` attached. If `flatten_tuple` is
@@ -480,7 +480,7 @@ SmallVector GetValueWithToken(
     SmallVector tuple_operands;
     for (auto idx : llvm::seq(0, tuple_type.getTypes().size()))
       tuple_operands.push_back(
-          builder.create(value.getLoc(), value, idx)
+          GetTupleElementOp::create(builder, value.getLoc(), value, idx)
              .getResult());
     tuple_operands.push_back(token);
@@ -518,7 +518,7 @@ Value CreateSubTuple(OpBuilder& builder, Value value, size_t end) {
   SmallVector tuple_operands;
   for (auto idx : llvm::seq(0, end))
     tuple_operands.push_back(
-        builder.create(value.getLoc(), value, idx)
+        GetTupleElementOp::create(builder, value.getLoc(), value, idx)
           .getResult());
   return CreateTuple(builder, value.getLoc(), tuple_operands);
@@ -543,8 +543,8 @@ void ReplaceWithTupleResult(OpBuilder& builder, ValueRange values,
   auto tuple_type = mlir::dyn_cast(value.getType());
   if (!tuple_type) {
     if (!value.use_empty()) {
-      auto new_element = builder.create(replacement.getLoc(),
-                                        replacement, 0);
+      auto new_element = GetTupleElementOp::create(
+          builder, replacement.getLoc(), replacement, 0);
       value.replaceAllUsesWith(new_element.getResult());
     }
     return;
@@ -620,8 +620,8 @@ void RewriteRegionIfOp(OpBuilder& builder, IfOp region_if,
                                                /*flatten_tuple=*/true);
   // Create new `mhlo.if` op with extra token operands and result.
-  auto new_if = builder.create(region_if.getLoc(), new_result_types,
-                               region_if.getPred());
+  auto new_if = IfOp::create(builder, region_if.getLoc(), new_result_types,
+                             region_if.getPred());
   // Move all regions from the old `mhlo.if` op to its replacement.
   new_if.getTrueBranch().takeBody(region_if.getTrueBranch());
@@ -745,8 +745,8 @@ void RewriteRegionWhileOp(OpBuilder& builder, WhileOp region_while,
                                                /*flatten_tuple*/ true);
   // Create new `mhlo.while` op with extra token operand and result.
-  auto new_while = builder.create(region_while.getLoc(),
-                                  new_result_types, new_val_operands);
+  auto new_while = WhileOp::create(builder, region_while.getLoc(),
+                                   new_result_types, new_val_operands);
   // Move all regions from the old `mhlo.while` op to its replacement.
   new_while.getCond().takeBody(region_while.getCond());
@@ -815,7 +815,7 @@ void RewriteFunctionTerminator(OpBuilder& builder,
   auto new_results = llvm::to_vector(terminator.getOperands());
   new_results.push_back(token);
   builder.setInsertionPoint(terminator);
-  builder.create(terminator.getLoc(), new_results);
+  mlir::func::ReturnOp::create(builder, terminator.getLoc(), new_results);
   terminator.erase();
 }
@@ -844,7 +844,7 @@ LogicalResult RewriteFunction(
   // a token will be created. Otherwise a token block argument is inserted.
   Value init_token = rewrite_block ?
func_body.addArgument(token_type, func.getLoc()) - : builder.create(func.getLoc(), token_type) + : CreateTokenOp::create(builder, func.getLoc(), token_type) .getResult(); // Stack to keep track of region based control flow op nesting and current diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td index 5507c82bc6f479..957c4887366e16 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td @@ -41,7 +41,7 @@ def CastValueToI64: NativeCodeCall< "CastValueToI64($0.getLoc(), $1, &$_builder)">; def CastValueToElementType: NativeCodeCall< - "$_builder.create($0.getLoc(), $1, " + "stablehlo::ConvertOp::create($_builder, $0.getLoc(), $1, " "getElementTypeOrSelf($2.getType()))">; // Here, $0 is an ElementsAttr with exactly one element of type integer. $1 is diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.cc b/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.cc index ecf3aea5f65d48..0b0e68548032a9 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.cc @@ -108,15 +108,15 @@ void PopulateEmptyIsland(tf_executor::IslandOp island) { OpBuilder builder(&island.GetBody(), island.GetBody().begin()); tf_executor::YieldOp yield = island.GetYield(); if (yield.getNumOperands() == 0) { - builder.create(island.getLoc(), TypeRange{}, ValueRange{}); + TF::NoOp::create(builder, island.getLoc(), TypeRange{}, ValueRange{}); } else if (yield.getNumOperands() == 1) { Value operand = yield.getOperand(0); - auto identity = builder.create(island.getLoc(), - operand.getType(), operand); + auto identity = TF::IdentityOp::create(builder, island.getLoc(), + operand.getType(), operand); yield.setOperand(0, identity.getOutput()); } else { - auto identity_n = builder.create( - island.getLoc(), yield.getOperandTypes(), yield.getOperands()); + auto identity_n = TF::IdentityNOp::create( + builder, island.getLoc(), yield.getOperandTypes(), yield.getOperands()); for (const auto& it : llvm::enumerate(identity_n.getResults())) yield.setOperand(it.index(), it.value()); } @@ -128,15 +128,15 @@ tf_executor::IslandOp CreateIsland(TypeRange result_types, const Location& loc, Operation& sub_op, tf_executor::IslandOp original_island) { OpBuilder builder(original_island); - auto island = builder.create( - loc, result_types, control_type, mlir::ValueRange{}); + auto island = tf_executor::IslandOp::create(builder, loc, result_types, + control_type, mlir::ValueRange{}); island.getBody().push_back(new Block); Block* block = &island.getBody().back(); OpBuilder island_builder(original_island); island_builder.setInsertionPointToEnd(block); sub_op.replaceAllUsesWith(island.getOutputs()); sub_op.moveBefore(block, block->begin()); - island_builder.create(loc, sub_op.getResults()); + tf_executor::YieldOp::create(island_builder, loc, sub_op.getResults()); return island; } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass_test.cc index d888b0c12588c9..2e5e4764f63d34 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass_test.cc @@ -53,12 +53,11 @@ class 
SplitIntoIslandPerOpPass : public ::testing::Test { llvm::SmallVector island_result_types; island_result_types.push_back(op_builder_.getF64Type()); - mlir::Operation* yield_op = op_builder_.create( - op_state.location, mlir::ValueRange{}); - mlir::tf_executor::IslandOp island_op = - op_builder_.create( - op_state.location, island_result_types, mlir::ValueRange{}, - mlir::ArrayRef{}); + mlir::Operation* yield_op = mlir::tf_executor::YieldOp::create( + op_builder_, op_state.location, mlir::ValueRange{}); + mlir::tf_executor::IslandOp island_op = mlir::tf_executor::IslandOp::create( + op_builder_, op_state.location, island_result_types, mlir::ValueRange{}, + mlir::ArrayRef{}); island_op.getBody().push_back(new mlir::Block); island_op.getBody().back().push_back(yield_op); return island_op; @@ -126,13 +125,13 @@ TEST_F(SplitIntoIslandPerOpPass, IslandOpTwoOpsSplitsIntoTwoIslands) { islandOp.getBody().back().push_front(inner_op_2); // Code relies on a parent with a fetch op containing the island op. mlir::tf_executor::GraphOp parent_graph_op = - op_builder_.create( - mlir::UnknownLoc::get(&context_), + mlir::tf_executor::GraphOp::create( + op_builder_, mlir::UnknownLoc::get(&context_), mlir::TypeRange{op_builder_.getF64Type()}); parent_graph_op.getRegion().push_back(new mlir::Block); parent_graph_op.push_back(islandOp); mlir::tf_executor::FetchOp fetch_op = - op_builder_.create(parent_graph_op.getLoc()); + mlir::tf_executor::FetchOp::create(op_builder_, parent_graph_op.getLoc()); parent_graph_op.GetBody().push_back(fetch_op); SplitIsland(islandOp, control_type); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tfxla_device_specific_transforms.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tfxla_device_specific_transforms.cc index a7e9726e7575a3..2f7089edacbe31 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/tfxla_device_specific_transforms.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tfxla_device_specific_transforms.cc @@ -57,8 +57,9 @@ LogicalResult TFXLADeviceSpecificTransforms::ConvertGetAlgOp( OpBuilder opbuilder(get_alg_op); - auto tf_const = opbuilder.create( - get_alg_op->getLoc(), opbuilder.getI32IntegerAttr((int)tensorflow_rng)); + auto tf_const = + TF::ConstOp::create(opbuilder, get_alg_op->getLoc(), + opbuilder.getI32IntegerAttr((int)tensorflow_rng)); get_alg_op->replaceAllUsesWith(tf_const); get_alg_op->erase(); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/utils.cc b/tensorflow/compiler/mlir/tf2xla/transforms/utils.cc index 0152cd1d1a7363..61c8e8e161425d 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/utils.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/utils.cc @@ -24,11 +24,11 @@ namespace mhlo { ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, OpBuilder* builder) { - return builder->create(loc, hlo::getScalarOfType(ty, raw_value)); + return ConstantOp::create(*builder, loc, hlo::getScalarOfType(ty, raw_value)); } ConstantOp GetScalarNegZeroOfType(Type ty, Location loc, OpBuilder* builder) { - return builder->create(loc, hlo::getScalarNegZeroOfType(ty)); + return ConstantOp::create(*builder, loc, hlo::getScalarNegZeroOfType(ty)); } DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr) { diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/utils.h b/tensorflow/compiler/mlir/tf2xla/transforms/utils.h index 5dba4a4dcf894c..a6b848ae2fc27b 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/utils.h +++ b/tensorflow/compiler/mlir/tf2xla/transforms/utils.h @@ -42,8 +42,8 @@ void BuildReduceBody(Type 
element_type, Region* body, OpBuilder* builder) { block->addArguments({type, type}, SmallVector(2, loc)); auto reducer = - builder->create(loc, block->getArgument(0), block->getArgument(1)); - builder->create(loc, reducer.getResult()); + Op::create(*builder, loc, block->getArgument(0), block->getArgument(1)); + ReturnOp::create(*builder, loc, reducer.getResult()); } ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc index 6572aef984b043..71dce38198c96a 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc @@ -63,8 +63,8 @@ class XlaLegalizeTargetsTest : public testing::Test { }; TEST_F(XlaLegalizeTargetsTest, CreatesConversionTargets) { - auto const_int = builder_.create( - builder_.getUnknownLoc(), builder_.getI32Type(), /*value=*/10); + auto const_int = mlir::arith::ConstantIntOp::create( + builder_, builder_.getUnknownLoc(), builder_.getI32Type(), /*value=*/10); ConversionTarget target = GetDefaultLegalConversionTargets(context_, /*legalize_chlo=*/false); @@ -72,8 +72,8 @@ TEST_F(XlaLegalizeTargetsTest, CreatesConversionTargets) { } TEST_F(XlaLegalizeTargetsTest, AllowsCHLODialect) { - auto const_int = builder_.create( - builder_.getUnknownLoc(), builder_.getI32TensorAttr({42})); + auto const_int = chlo::ConstantOp::create(builder_, builder_.getUnknownLoc(), + builder_.getI32TensorAttr({42})); ConversionTarget target = GetDefaultLegalConversionTargets(context_, /*legalize_chlo=*/true); @@ -82,8 +82,8 @@ TEST_F(XlaLegalizeTargetsTest, AllowsCHLODialect) { } TEST_F(XlaLegalizeTargetsTest, DontAllowCHLODialect) { - auto const_int = builder_.create( - builder_.getUnknownLoc(), builder_.getI32TensorAttr({42})); + auto const_int = chlo::ConstantOp::create(builder_, builder_.getUnknownLoc(), + builder_.getI32TensorAttr({42})); ConversionTarget target = GetDefaultLegalConversionTargets(context_, /*legalize_chlo=*/false); diff --git a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc index e2f1bdbfb0a0de..2672a90f93cdd3 100644 --- a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc +++ b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc @@ -81,8 +81,9 @@ absl::StatusOr> TFRDecomposeContext::Get( std::string tfr_lib_dir; TF_RETURN_IF_ERROR(ReadStringFromEnvVar( kTFRLibEnv, "tensorflow/compiler/mlir/tfr/resources", &tfr_lib_dir)); - string composite_mlir_dir = io::JoinPath(env->GetRunfilesDir(), tfr_lib_dir); - std::vector files; + std::string composite_mlir_dir = + io::JoinPath(env->GetRunfilesDir(), tfr_lib_dir); + std::vector files; TF_RETURN_IF_ERROR(env->GetChildren(composite_mlir_dir, &files)); if (files.empty()) { return errors::Internal(absl::StrCat( @@ -90,7 +91,7 @@ absl::StatusOr> TFRDecomposeContext::Get( } std::string tfr_raw_text; for (const auto& file : files) { - string fullpath = io::JoinPath(composite_mlir_dir, file); + std::string fullpath = io::JoinPath(composite_mlir_dir, file); if (env->MatchPath(fullpath, io::JoinPath(composite_mlir_dir, "*.mlir"))) { std::string text; TF_RETURN_IF_ERROR(ReadFileToString(env, fullpath, &text)); diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc index d44e65f029ada3..66b5167839731b 100644 --- 
a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc @@ -118,10 +118,11 @@ class TFRInlinerInterface : public DialectInlinerInterface { auto result_itype = llvm::cast(result_type); if (input_itype.getWidth() == result_itype.getWidth()) return nullptr; if (input_itype.getWidth() > result_itype.getWidth()) { - return builder.create(conversion_loc, result_type, - input); + return arith::TruncIOp::create(builder, conversion_loc, result_type, + input); } else { - return builder.create(conversion_loc, result_type, input); + return arith::ExtSIOp::create(builder, conversion_loc, result_type, + input); } } }; @@ -148,11 +149,11 @@ TFRDialect::TFRDialect(MLIRContext *context) Operation *TFRDialect::materializeConstant(OpBuilder &builder, Attribute value, Type type, Location loc) { if (arith::ConstantOp::isBuildableWith(value, type)) - return builder.create(loc, type, - llvm::cast(value)); + return arith::ConstantOp::create(builder, loc, type, + llvm::cast(value)); if (func::ConstantOp::isBuildableWith(value, type)) - return builder.create( - loc, type, llvm::cast(value)); + return func::ConstantOp::create(builder, loc, type, + llvm::cast(value)); return nullptr; } @@ -421,9 +422,10 @@ class ConvertConstToTensorConst : public OpRewritePattern { {static_cast(array.size())}, *all_types.begin()); DenseElementsAttr attr = DenseElementsAttr::get(new_out_type, array.getValue()); - new_cst = rewriter.create(loc, new_out_type, attr); + new_cst = TF::ConstOp::create(rewriter, loc, new_out_type, attr); if (isa(out_type)) { - new_cst = rewriter.create(loc, out_type, new_cst->getResult(0)); + new_cst = + CastOp::create(rewriter, loc, out_type, new_cst->getResult(0)); } rewriter.replaceOp(cst_tensor_op, new_cst->getResult(0)); return success(); @@ -432,9 +434,10 @@ class ConvertConstToTensorConst : public OpRewritePattern { TypedAttr scalar; if (matchPattern(cst_tensor_op.getArg(), m_Constant(&scalar))) { Type new_out_type = RankedTensorType::get({}, scalar.getType()); - new_cst = rewriter.create(loc, new_out_type, scalar); + new_cst = TF::ConstOp::create(rewriter, loc, new_out_type, scalar); if (isa(out_type)) { - new_cst = rewriter.create(loc, out_type, new_cst->getResult(0)); + new_cst = + CastOp::create(rewriter, loc, out_type, new_cst->getResult(0)); } rewriter.replaceOp(cst_tensor_op, new_cst->getResult(0)); return success(); @@ -481,8 +484,8 @@ class RemoveRedundantCast : public OpRewritePattern { if ((input_tensor_type.getElementType() != output_tensor_type.getElementType()) && !isQuantizedType(input_type) && !isQuantizedType(output_type)) { - auto new_tfr_cast = rewriter.create( - cast_op.getLoc(), + auto new_tfr_cast = TFR::CastOp::create( + rewriter, cast_op.getLoc(), output_tensor_type.clone(input_tensor_type.getElementType()), cast_op.getArg()); rewriter.replaceOpWithNewOp(cast_op, output_type, @@ -652,8 +655,9 @@ class RemoveRawDataOp : public OpRewritePattern { new_list_values.push_back(redundant_cast.getArg()); } - auto new_list = rewriter.create( - raw_data_op.getLoc(), preceding_list.getType(), new_list_values); + auto new_list = + BuildListOp::create(rewriter, raw_data_op.getLoc(), + preceding_list.getType(), new_list_values); raw_data_op.getOutput().replaceAllUsesWith(new_list.getOut()); return success(); } @@ -679,11 +683,11 @@ class RemoveQParamsOp : public OpRewritePattern { rewriter.setInsertionPoint(qparams_op); Location loc = qparams_op->getLoc(); if (auto qtype = llvm::dyn_cast(cast_qtype)) { - scale_op = rewriter.create( - loc, 
RankedTensorType::get({}, rewriter.getF32Type()), + scale_op = TF::ConstOp::create( + rewriter, loc, RankedTensorType::get({}, rewriter.getF32Type()), rewriter.getF32FloatAttr(qtype.getScale())); - zp_op = rewriter.create( - loc, RankedTensorType::get({}, rewriter.getI32Type()), + zp_op = TF::ConstOp::create( + rewriter, loc, RankedTensorType::get({}, rewriter.getI32Type()), rewriter.getI32IntegerAttr(qtype.getZeroPoint())); } else if (auto qtype = llvm::dyn_cast( cast_qtype)) { @@ -697,20 +701,20 @@ class RemoveQParamsOp : public OpRewritePattern { {static_cast(num_channels)}, rewriter.getF32Type()); auto scales_attr = DenseElementsAttr::get(scales_type, llvm::ArrayRef(scales)); - scale_op = rewriter.create(loc, scales_attr); + scale_op = TF::ConstOp::create(rewriter, loc, scales_attr); auto zps_type = RankedTensorType::get( {static_cast(num_channels)}, rewriter.getI32Type()); auto zps_attr = DenseElementsAttr::get(zps_type, llvm::ArrayRef(zps)); - zp_op = rewriter.create(loc, zps_attr); + zp_op = TF::ConstOp::create(rewriter, loc, zps_attr); } if (!scale_op || !zp_op) { return failure(); } - auto scale_cast = rewriter.create( - loc, qparams_op.getScale().getType(), scale_op.getOutput()); - auto zp_cast = rewriter.create(loc, qparams_op.getZp().getType(), - zp_op.getOutput()); + auto scale_cast = CastOp::create( + rewriter, loc, qparams_op.getScale().getType(), scale_op.getOutput()); + auto zp_cast = CastOp::create(rewriter, loc, qparams_op.getZp().getType(), + zp_op.getOutput()); qparams_op.getScale().replaceAllUsesWith(scale_cast.getOut()); qparams_op.getZp().replaceAllUsesWith(zp_cast.getOut()); @@ -787,10 +791,11 @@ class RemoveScaleFactorOp : public OpRewritePattern { } rewriter.setInsertionPoint(scale_factor_op); const Location loc = scale_factor_op->getLoc(); - auto result_scale_op = rewriter.create( - loc, DenseElementsAttr::get(scale_type, llvm::ArrayRef(scale_factors))); - auto result_scale_cast_op = rewriter.create( - loc, scale_factor_op.getType(), result_scale_op.getOutput()); + auto result_scale_op = TF::ConstOp::create( + rewriter, loc, + DenseElementsAttr::get(scale_type, llvm::ArrayRef(scale_factors))); + auto result_scale_cast_op = CastOp::create( + rewriter, loc, scale_factor_op.getType(), result_scale_op.getOutput()); scale_factor_op.getScaleFactor().replaceAllUsesWith( result_scale_cast_op.getOut()); return success(); @@ -812,50 +817,55 @@ class RemoveRescaleOp : public OpRewritePattern { const Location loc = rescale_op->getLoc(); const auto result_types = rescale_op->getResultTypes(); auto c_false = - rewriter.create(loc, rewriter.getBoolAttr(false)); + arith::ConstantOp::create(rewriter, loc, rewriter.getBoolAttr(false)); TypeAttr f32_attr = TypeAttr::get(rewriter.getF32Type()); TFRAttrType output_type = TFRAttrType::get(rewriter.getContext()); - auto constant_f32_op = rewriter.create(loc, output_type, f32_attr); + auto constant_f32_op = + ConstOp::create(rewriter, loc, output_type, f32_attr); TypeAttr i32_attr = TypeAttr::get(rewriter.getI32Type()); - auto constant_i32_op = rewriter.create(loc, output_type, i32_attr); + auto constant_i32_op = + ConstOp::create(rewriter, loc, output_type, i32_attr); IntegerAttr zp_attr; if (!matchPattern(zp, m_Constant(&zp_attr))) { return failure(); } rewriter.setInsertionPoint(zp.getDefiningOp()); - auto zp_tensor = rewriter.create( - loc, RankedTensorType::get({}, zp.getType()), zp_attr); - auto zp_cast = rewriter.create( - loc, rewriter.getType(), zp_tensor.getOutput()); + auto zp_tensor = TF::ConstOp::create( + rewriter, loc, 
RankedTensorType::get({}, zp.getType()), zp_attr); + auto zp_cast = + CastOp::create(rewriter, loc, rewriter.getType(), + zp_tensor.getOutput()); rewriter.setInsertionPoint(rescale_op); - auto cast_input_to_float_op = rewriter.create( - loc, result_types, - SymbolRefAttr::get(rewriter.getContext(), "tf__cast"), - ArrayRef{input, constant_f32_op, c_false}, - /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); - auto input_x_scale_op = rewriter.create( - loc, result_types, SymbolRefAttr::get(rewriter.getContext(), "tf__mul"), + auto cast_input_to_float_op = + CallOp::create(rewriter, loc, result_types, + SymbolRefAttr::get(rewriter.getContext(), "tf__cast"), + ArrayRef{input, constant_f32_op, c_false}, + /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); + auto input_x_scale_op = CallOp::create( + rewriter, loc, result_types, + SymbolRefAttr::get(rewriter.getContext(), "tf__mul"), ArrayRef{cast_input_to_float_op.getResult(0), scale}, /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); - auto round_rescaled_op = rewriter.create( - loc, result_types, - SymbolRefAttr::get(rewriter.getContext(), "tf__round"), - ArrayRef{input_x_scale_op->getResult(0)}, - /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); - auto cast_zp_to_float_op = rewriter.create( - loc, result_types, - SymbolRefAttr::get(rewriter.getContext(), "tf__cast"), - ArrayRef{zp_cast, constant_f32_op, c_false}, - /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); - auto recentered_op = rewriter.create( - loc, result_types, SymbolRefAttr::get(rewriter.getContext(), "tf__add"), - ArrayRef{round_rescaled_op->getResult(0), - cast_zp_to_float_op->getResult(0)}, - /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); - auto cast_output_to_i32 = rewriter.create( - loc, result_types, + auto round_rescaled_op = + CallOp::create(rewriter, loc, result_types, + SymbolRefAttr::get(rewriter.getContext(), "tf__round"), + ArrayRef{input_x_scale_op->getResult(0)}, + /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); + auto cast_zp_to_float_op = + CallOp::create(rewriter, loc, result_types, + SymbolRefAttr::get(rewriter.getContext(), "tf__cast"), + ArrayRef{zp_cast, constant_f32_op, c_false}, + /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); + auto recentered_op = + CallOp::create(rewriter, loc, result_types, + SymbolRefAttr::get(rewriter.getContext(), "tf__add"), + ArrayRef{round_rescaled_op->getResult(0), + cast_zp_to_float_op->getResult(0)}, + /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); + auto cast_output_to_i32 = CallOp::create( + rewriter, loc, result_types, SymbolRefAttr::get(rewriter.getContext(), "tf__cast"), ArrayRef{recentered_op->getResult(0), constant_i32_op, c_false}, /*args_attrs=*/nullptr, /*res_attrs=*/nullptr); diff --git a/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc b/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc index fb0640536d4fe5..7a03a46972371c 100644 --- a/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc +++ b/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc @@ -75,12 +75,12 @@ class UnrollSCFForOp : public OpRewritePattern { for (auto i = 0; i < trip_count; ++i) { if (!iv.use_empty()) { // iv' = iv + step * i; - Value iter = rewriter.create(loc, i); + Value iter = arith::ConstantIndexOp::create(rewriter, loc, i); Value step_cst = - rewriter.create(loc, step.getSExtValue()); - Value stride = rewriter.create(loc, step_cst, iter); + arith::ConstantIndexOp::create(rewriter, loc, step.getSExtValue()); + Value stride = arith::MulIOp::create(rewriter, loc, step_cst, iter); Value iv_unroll = - rewriter.create(loc, mapping.lookup(iv), 
stride); + arith::AddIOp::create(rewriter, loc, mapping.lookup(iv), stride); mapping.map(iv, iv_unroll); } diff --git a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc index 94a84cc3072ea6..5dd6a22f90c972 100644 --- a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc +++ b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc @@ -148,7 +148,7 @@ class RewriteTFRCallOp : public OpRewritePattern { mlir::cast(cast_op.getInputElementType()).getValue(); if (result_elt_type != original_input_type) { UnrankedTensorType result_type = UnrankedTensorType::get(result_elt_type); - return rewriter.create(loc, result_type, cast_op.getArg()); + return TF::CastOp::create(rewriter, loc, result_type, cast_op.getArg()); } return cast_op.getArg(); } @@ -167,7 +167,7 @@ class RewriteTFRCallOp : public OpRewritePattern { Type current_input_type = mlir::cast(input_types[i]).getValue(); if (current_input_type != target_input_type) { input_values[i] = - rewriter.create(loc, result_type, input_values[i]); + TF::CastOp::create(rewriter, loc, result_type, input_values[i]); } } } @@ -397,7 +397,7 @@ LogicalResult RewriteTFRCallOp::CreateAndReplaceOp( Type res_type = res.value(); if (mlir::dyn_cast(res_type)) { Value new_res = new_op->getResult(res.index()); - auto casted = rewriter.create(loc, res_type, new_res); + auto casted = CastOp::create(rewriter, loc, res_type, new_res); new_results.push_back(casted.getOut()); } else if (auto list_type = mlir::dyn_cast(res.value())) { @@ -405,10 +405,10 @@ LogicalResult RewriteTFRCallOp::CreateAndReplaceOp( for (int i = res.index(); i < new_op->getNumResults(); i++) { Value new_res = new_op->getResult(i); auto casted = - rewriter.create(loc, unconstrainted_type, new_res); + CastOp::create(rewriter, loc, unconstrainted_type, new_res); tensor_list.push_back(casted.getOut()); } - auto list_op = rewriter.create(loc, res_type, tensor_list); + auto list_op = BuildListOp::create(rewriter, loc, res_type, tensor_list); new_results.push_back(list_op.getOut()); } } diff --git a/tensorflow/compiler/mlir/tfrt/tests/xla_rewrite.mlir b/tensorflow/compiler/mlir/tfrt/tests/xla_rewrite.mlir index 2118183569e6ed..24e195dbc9dc42 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/xla_rewrite.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/xla_rewrite.mlir @@ -27,7 +27,14 @@ func.func @xla_launch(%arg: tensor, %v0: tensor<*x!tf_type.resource>, %v1: device = "/device:GPU:0", executor_type = "", f = @callee} : (tensor, tensor, tensor<*x!tf_type.resource>, tensor, tensor<*x!tf_type.resource>) -> tensor - func.return %r2 : tensor + // CHECK: tf.XlaLaunchV2 + // CHECK-SAME: constants = [0, 3] + // CHECK-SAME: resources = [2, 4] + %r3 = "tf.PartitionedCall"(%c0, %r2, %v0, %c1, %v1) {_XlaMustCompile = true, config = "", config_proto = "", + device = "/device:CPU:0", executor_type = "", f = @callee} + : (tensor, tensor, tensor<*x!tf_type.resource>, tensor, tensor<*x!tf_type.resource>) -> tensor + + func.return %r3 : tensor } func.func @callee(%c0: tensor, %arg: tensor, %v0: tensor<*x!tf_type.resource>, %c1: tensor, %v1: tensor<*x!tf_type.resource>) -> (tensor) { diff --git a/tensorflow/compiler/mlir/tfrt/transforms/deduplicate_if_result_pass.cc b/tensorflow/compiler/mlir/tfrt/transforms/deduplicate_if_result_pass.cc index a42d2f5d2ad7d1..aafd3d958f826b 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/deduplicate_if_result_pass.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/deduplicate_if_result_pass.cc @@ -94,7 +94,7 @@ mlir::func::FuncOp 
CreateBranchFunctionWithDeduplicatedResults( auto new_func_type = mlir::FunctionType::get(builder.getContext(), arg_types, new_result_types); - auto new_func = builder.create(loc, name, new_func_type); + auto new_func = mlir::func::FuncOp::create(builder, loc, name, new_func_type); new_func.setVisibility(mlir::func::FuncOp::Visibility::Private); mlir::OpBuilder::InsertionGuard guard(builder); @@ -110,8 +110,8 @@ mlir::func::FuncOp CreateBranchFunctionWithDeduplicatedResults( // Create the call op to the original func. The arguments are simply // the arguments from the wrapper function. - auto call_op = builder.create( - loc, result_types, block->getArguments(), /*args_attrs=*/nullptr, + auto call_op = mlir::TF::PartitionedCallOp::create( + builder, loc, result_types, block->getArguments(), /*args_attrs=*/nullptr, /*res_attrs=*/nullptr, mlir::FlatSymbolRefAttr::get(func.getSymNameAttr()), empty_string_attr, empty_string_attr, empty_string_attr); @@ -120,7 +120,7 @@ mlir::func::FuncOp CreateBranchFunctionWithDeduplicatedResults( results.push_back(call_op.getResult(i)); } - builder.create(loc, results); + mlir::func::ReturnOp::create(builder, loc, results); return new_func; } @@ -183,8 +183,8 @@ void DeduplicateIfOps(mlir::ModuleOp module) { new_result_types.push_back(op->getResult(i).getType()); } - auto new_if_op = builder.create( - op.getLoc(), new_result_types, op.getCond(), op.getInput(), + auto new_if_op = mlir::TF::IfOp::create( + builder, op.getLoc(), new_result_types, op.getCond(), op.getInput(), new_then_func.getSymName(), new_else_func.getSymName(), op.getIsStateless()); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc b/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc index 77de1e0eb48669..73d5836fa895a6 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc @@ -159,12 +159,11 @@ void FuseCompileAndExecuteOps( auto producer_name = used_exec_op->getAttrOfType("_producer_name"); if (!producer_name) producer_name = mlir::StringAttr::get(context, "default"); - auto compile_and_execute_op = - builder.create( - used_exec_op.getLoc(), output_types, exec_op_args, - static_shape_tensors, - builder.getI32ArrayAttr(static_shaped_operand_indices_attr), - compile_op.getMlirModule(), compile_op.getMetadata(), producer_name); + auto compile_and_execute_op = mlir::TF::TPUCompileMlirAndExecuteOp::create( + builder, used_exec_op.getLoc(), output_types, exec_op_args, + static_shape_tensors, + builder.getI32ArrayAttr(static_shaped_operand_indices_attr), + compile_op.getMlirModule(), compile_op.getMetadata(), producer_name); for (auto exec_op : exec_op_in_group) { exec_op.replaceAllUsesWith(compile_and_execute_op.getResults()); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc index 2fc2c173fed8ba..1e2231f1c59584 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc @@ -151,8 +151,8 @@ class RewriteClusterToIfrtCallPass // ifrt program already exists builder.setInsertionPoint(cluster_func); - mlir::TF::IfrtCallOp ifrt_call_op = builder.create( - cluster_func->getLoc(), cluster_func.getResultTypes(), + mlir::TF::IfrtCallOp ifrt_call_op = mlir::TF::IfrtCallOp::create( + builder, 
cluster_func->getLoc(), cluster_func.getResultTypes(), cluster_func->getOperands()); int64_t program_id; @@ -189,8 +189,8 @@ class RewriteClusterToIfrtCallPass mlir::OpBuilder::InsertionGuard insertion_guard(builder); builder.setInsertionPoint(callee_func); - mlir::func::FuncOp cloned_ifrt_program = builder.create( - callee_func->getLoc(), ifrt_program_name, + mlir::func::FuncOp cloned_ifrt_program = mlir::func::FuncOp::create( + builder, callee_func->getLoc(), ifrt_program_name, callee_func.getFunctionType()); mlir::IRMapping mapper; callee_func.cloneInto(cloned_ifrt_program, mapper); @@ -226,8 +226,8 @@ class RewriteClusterToIfrtCallPass builder.setInsertionPoint(cluster_func); - mlir::TF::IfrtCallOp ifrt_call_op = builder.create( - cluster_func->getLoc(), cluster_func.getResultTypes(), + mlir::TF::IfrtCallOp ifrt_call_op = mlir::TF::IfrtCallOp::create( + builder, cluster_func->getLoc(), cluster_func.getResultTypes(), cluster_func->getOperands()); // TODO(b/304839793): populate variable names after adding a variable diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc index 5220824d3f716a..d0c8f03bf7f9c2 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc @@ -130,15 +130,15 @@ class TfRestoreMergingPass // merged in order to keep the dominance property. mlir::OpBuilder builder(restores_to_merge.front()); - auto new_tensor_names = builder.create( - builder.getFusedLoc(tensor_names_locs), + auto new_tensor_names = mlir::TF::ConstOp::create( + builder, builder.getFusedLoc(tensor_names_locs), GetStringTensorAttr(merged_tensor_names)); - auto new_shape_and_slices = builder.create( - builder.getFusedLoc(shape_and_slices_locs), + auto new_shape_and_slices = mlir::TF::ConstOp::create( + builder, builder.getFusedLoc(shape_and_slices_locs), GetStringTensorAttr(merged_shape_and_slices)); - auto new_restore = builder.create( - builder.getFusedLoc(restore_locs), + auto new_restore = mlir::TF::RestoreV2Op::create( + builder, builder.getFusedLoc(restore_locs), mlir::TypeRange(mlir::ValueRange(values_to_replace)), prefix, new_tensor_names, new_shape_and_slices); for (auto [old_value, new_value] : diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc index 130ca0a2e90b74..cb5b3e7afdcc13 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc @@ -93,15 +93,15 @@ class TfRestoreSplittingPass shape_and_slices.getValues(), restore.getTensors())) { auto new_tensor_names = - builder.create(restore.getTensorNames().getLoc(), - GetStringTensorAttr({tensor_name})); + mlir::TF::ConstOp::create(builder, restore.getTensorNames().getLoc(), + GetStringTensorAttr({tensor_name})); - auto new_shape_and_slices = builder.create( - restore.getShapeAndSlices().getLoc(), + auto new_shape_and_slices = mlir::TF::ConstOp::create( + builder, restore.getShapeAndSlices().getLoc(), GetStringTensorAttr({shape_and_slice})); - auto new_restore = builder.create( - restore.getLoc(), mlir::TypeRange({result.getType()}), + auto new_restore = mlir::TF::RestoreV2Op::create( + builder, restore.getLoc(), mlir::TypeRange({result.getType()}), restore.getPrefix(), new_tensor_names, new_shape_and_slices); 
result.replaceAllUsesWith(new_restore.getTensors()[0]); } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc index 34b37eeefe7843..916b41620ad33e 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc @@ -440,8 +440,8 @@ void LowerTFSavedModelPass::HoistInvariantOps(mlir::ModuleOp module) { // "_tfrt_resource_init" is the special function that executes all invariant // ops (eg. read-only variables) used in the model. This function should be // executed after user-specified initialization. - auto init_func_op = builder.create( - module.getLoc(), "_tfrt_resource_init", + auto init_func_op = mlir::func::FuncOp::create( + builder, module.getLoc(), "_tfrt_resource_init", mlir::FunctionType::get(module.getContext(), /*inputs=*/{}, /*results=*/{})); auto *block = init_func_op.addEntryBlock(); @@ -481,8 +481,8 @@ void LowerTFSavedModelPass::HoistInvariantOps(mlir::ModuleOp module) { auto *new_op = new_value.getDefiningOp(); assert(new_op); builder.setInsertionPointAfter(new_op); - auto set_resource_op = builder.create( - new_op->getLoc(), new_value, index); + auto set_resource_op = mlir::TF::_TfrtSetResourceOp::create( + builder, new_op->getLoc(), new_value, index); // Preserve the device attribute. llvm::StringRef device = kCpuDeviceName; @@ -494,7 +494,7 @@ void LowerTFSavedModelPass::HoistInvariantOps(mlir::ModuleOp module) { builder.setInsertionPointToEnd(block); // Finish building the init function by inserting an return op. - builder.create(init_func_op.getLoc()); + mlir::func::ReturnOp::create(builder, init_func_op.getLoc()); // Now that we have the index for each value that will be replaced, we can // create the tf._TfrtGetResource op in each function using these indices. @@ -568,8 +568,8 @@ void LowerTFSavedModelPass::ReplaceHoistedValues( llvm::SmallVector new_values; if (fuse_get_resource_ops_) { - auto get_resource_op = builder.create( - block->getParentOp()->getLoc(), old_values.getTypes(), + auto get_resource_op = mlir::TF::_TfrtGetResourceOp::create( + builder, block->getParentOp()->getLoc(), old_values.getTypes(), builder.getI64ArrayAttr(indices), builder.getStrArrayAttr(shared_name_arr), builder.getStrArrayAttr(container_arr)); @@ -577,8 +577,8 @@ void LowerTFSavedModelPass::ReplaceHoistedValues( new_values = get_resource_op.getResults(); } else { for (int i = 0; i < old_values.size(); ++i) { - auto get_resource_op = builder.create( - block->getParentOp()->getLoc(), + auto get_resource_op = mlir::TF::_TfrtGetResourceOp::create( + builder, block->getParentOp()->getLoc(), mlir::TypeRange(old_values[i].getType()), builder.getI64ArrayAttr(indices[i]), builder.getStrArrayAttr(shared_name_arr[i]), @@ -670,8 +670,8 @@ mlir::LogicalResult ConvertReferenceVariableToResourceVariable( mlir::OpBuilder builder(var_op); - auto var_handle_op = builder.create( - var_op.getLoc(), + auto var_handle_op = mlir::TF::VarHandleOp::create( + builder, var_op.getLoc(), mlir::RankedTensorType::get( {}, mlir::TF::ResourceType::get( llvm::ArrayRef{tensor_type}, @@ -682,8 +682,8 @@ mlir::LogicalResult ConvertReferenceVariableToResourceVariable( // Set insertion point to this identity_op so that the side-effect // visibility is preserved. 
builder.setInsertionPoint(op); - auto read_var_op = builder.create( - op.getLoc(), op.getType(), var_handle_op); + auto read_var_op = mlir::TF::ReadVariableOp::create( + builder, op.getLoc(), op.getType(), var_handle_op); op.replaceAllUsesWith(read_var_op.getValue()); op.erase(); } @@ -692,8 +692,8 @@ mlir::LogicalResult ConvertReferenceVariableToResourceVariable( // Set the insertion point after the assign op so that all operands are // dominating the newly created op. builder.setInsertionPoint(op); - builder.create(op.getLoc(), var_handle_op, - op.getValue()); + mlir::TF::AssignVariableOp::create(builder, op.getLoc(), var_handle_op, + op.getValue()); op.erase(); } @@ -704,8 +704,8 @@ mlir::LogicalResult ConvertReferenceVariableToResourceVariable( // the newly created op. builder.setInsertionPoint(op); // Create a new read variable op, so that the side-effects are preserved. - auto read_var_op = builder.create( - op->getLoc(), tensor_type, var_handle_op); + auto read_var_op = mlir::TF::ReadVariableOp::create( + builder, op->getLoc(), tensor_type, var_handle_op); op->setOperand(idx, read_var_op.getValue()); } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/merge_tf_if_ops.cc b/tensorflow/compiler/mlir/tfrt/transforms/merge_tf_if_ops.cc index 59f602c0991faf..38737e22d1c588 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/merge_tf_if_ops.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/merge_tf_if_ops.cc @@ -225,8 +225,8 @@ class MergeTfIfOpsPass [](mlir::TF::IfOp op) { return op.getIsStateless(); }); // Create the merged tf.If op using the new branches. - auto new_if_op = builder.create( - loc, new_result_types, if_ops.front().getCond(), + auto new_if_op = mlir::TF::IfOp::create( + builder, loc, new_result_types, if_ops.front().getCond(), if_ops.front().getInput(), then_branch_name, else_branch_name, is_stateless); @@ -249,8 +249,8 @@ class MergeTfIfOpsPass llvm::ArrayRef if_ops, llvm::function_ref get_branch) { std::string branch_name = absl::StrCat(branch_prefix, branch_suffix); - auto branch = builder.create(loc, branch_name, - branch_function_type); + auto branch = mlir::func::FuncOp::create(builder, loc, branch_name, + branch_function_type); branch.setVisibility(mlir::func::FuncOp::Visibility::Private); mlir::OpBuilder::InsertionGuard guard(builder); @@ -267,8 +267,9 @@ class MergeTfIfOpsPass for (auto if_op : if_ops) { // Create the call op to the original branch. The arguments are simply // the arguments from the wrapper function. 
- auto call_op = builder.create( - if_op.getLoc(), if_op.getResultTypes(), block->getArguments(), + auto call_op = mlir::TF::PartitionedCallOp::create( + builder, if_op.getLoc(), if_op.getResultTypes(), + block->getArguments(), /*args_attrs=*/nullptr, /*res_attrs=*/nullptr, get_branch(if_op), empty_string_attr, empty_string_attr, empty_string_attr); @@ -276,7 +277,7 @@ class MergeTfIfOpsPass results.append(call_op.getOutput().begin(), call_op.getOutput().end()); } - builder.create(loc, results); + mlir::func::ReturnOp::create(builder, loc, results); return branch.getSymName(); } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc b/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc index 0ed5a6ac1b6a8a..fea7a988bd40d7 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc @@ -38,15 +38,16 @@ namespace tensorflow { namespace tfrt_compiler { namespace { -struct RewriteStatefulPartitionedCallToXlaLaunchOnCpu - : public mlir::OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +template +struct RewriteFunctionCallToXlaLaunchOnCpu + : public mlir::OpRewritePattern { + public: + using mlir::OpRewritePattern::OpRewritePattern; mlir::LogicalResult matchAndRewrite( - mlir::TF::StatefulPartitionedCallOp op, - mlir::PatternRewriter& rewriter) const override { + OpType op, mlir::PatternRewriter& rewriter) const override { if (auto xla_must_compile = - op->getAttrOfType("_XlaMustCompile"); + op->template getAttrOfType("_XlaMustCompile"); !xla_must_compile || !xla_must_compile.getValue()) { return mlir::failure(); } @@ -92,7 +93,11 @@ struct TfrtXlaRewritePass void runOnOperation() override { mlir::RewritePatternSet patterns(&getContext()); - patterns.add(&getContext()); + patterns + .add>( + &getContext()); + patterns.add>(&getContext()); if (mlir::failed( mlir::applyPatternsGreedily(getOperation(), std::move(patterns)))) { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc index 159e630fb8fb16..b0ad89b6b55d24 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc @@ -280,8 +280,8 @@ DenseMap realizeBroadcastIntents( setInsertionPointToEarliestPointWithAllValuesAvailable( rewriter, parentBlock, ValueRange{it.targetValue, it.outputDimensions}); - realizations[it] = rewriter.create( - it.targetValue.getLoc(), it.resultType, it.targetValue, + realizations[it] = DynamicBroadcastInDimOp::create( + rewriter, it.targetValue.getLoc(), it.resultType, it.targetValue, it.outputDimensions, mlir::cast(it.broadcastDimensions)); continue; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc index 200f09c33021b1..18459a9e4e13a8 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc @@ -95,10 +95,10 @@ struct AllocOpConverter : public OpConversionPattern { alloc, alloc.getType(), *ctx, adaptor.getOperands(), reuse_input_candidates, reuse_output_index); Location loc = buffer.getLoc(); - Value cond = rewriter.create( - loc, rewriter.getIntegerType(1), buffer); - rewriter.create(loc, *ctx, cond, 
ErrorCode::RESOURCE_EXHAUSTED, - "failed to allocate memory"); + Value cond = IsValidMemRefOp::create(rewriter, loc, + rewriter.getIntegerType(1), buffer); + TFAssertOp::create(rewriter, loc, *ctx, cond, ErrorCode::RESOURCE_EXHAUSTED, + "failed to allocate memory"); return success(); } }; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc index 89d946516f6b9b..59792ae7297ce2 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc @@ -65,8 +65,8 @@ LogicalResult RewriteToFullJit(func::FuncOp op) { old_body->getArgumentTypes(), locs); // Create the JIT compile op. - auto jit_compile_op = rewriter.create( - loc, rewriter.getType(), + auto jit_compile_op = tf_framework::JITCompileOp::create( + rewriter, loc, rewriter.getType(), /*ctx=*/mlir::Value()); // Move the original functions operations into the body. @@ -80,18 +80,18 @@ LogicalResult RewriteToFullJit(func::FuncOp op) { Operation *terminator = jit_block->getTerminator(); rewriter.setInsertionPointAfter(terminator); - rewriter.create( - loc, terminator->getOperands().front()); + tf_framework::JITCompileYieldOp::create(rewriter, loc, + terminator->getOperands().front()); terminator->erase(); } // Create JIT execute op. - auto execute = rewriter.create( - loc, op.getResultTypes().front(), /*ctx=*/Value(), + auto execute = tf_framework::JITExecuteOp::create( + rewriter, loc, op.getResultTypes().front(), /*ctx=*/Value(), jit_compile_op.getResult(), new_body->getArguments()); // Create a return. - rewriter.create(loc, execute.getResult()); + func::ReturnOp::create(rewriter, loc, execute.getResult()); return success(); } @@ -111,28 +111,28 @@ LogicalResult RewriteToLargeSizeJit(FuncOp op) { // Create large argument condition. auto arg_1 = new_body->getArgument(0); - auto shape_1 = rewriter.create(loc, arg_1); - auto num_elems_1 = rewriter.create(loc, shape_1); - Value cst_i32_limit = rewriter.create(loc, i32Limit); - Value large_tensor_predicate = rewriter.create( - loc, arith::CmpIPredicate::sgt, num_elems_1, cst_i32_limit); + auto shape_1 = shape::ShapeOfOp::create(rewriter, loc, arg_1); + auto num_elems_1 = shape::NumElementsOp::create(rewriter, loc, shape_1); + Value cst_i32_limit = arith::ConstantIndexOp::create(rewriter, loc, i32Limit); + Value large_tensor_predicate = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, num_elems_1, cst_i32_limit); if (new_body->getNumArguments() > 1) { auto arg_2 = new_body->getArgument(1); - auto shape_2 = rewriter.create(loc, arg_2); - auto num_elems_2 = rewriter.create(loc, shape_2); - large_tensor_predicate = rewriter.create( - loc, large_tensor_predicate, + auto shape_2 = shape::ShapeOfOp::create(rewriter, loc, arg_2); + auto num_elems_2 = shape::NumElementsOp::create(rewriter, loc, shape_2); + large_tensor_predicate = arith::OrIOp::create( + rewriter, loc, large_tensor_predicate, // Compare op to check size of the second op - rewriter.create(loc, arith::CmpIPredicate::sgt, - num_elems_2, cst_i32_limit)); + arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::sgt, + num_elems_2, cst_i32_limit)); } // Create dispatch code. auto jit_body_builder_fn = [&](OpBuilder &b, Location loc) { // Create JIT compile op. 
auto callable_ty = b.getType(); - auto jit_compile_op = - b.create(loc, callable_ty, /*ctx=*/Value()); + auto jit_compile_op = tf_framework::JITCompileOp::create( + b, loc, callable_ty, /*ctx=*/Value()); { OpBuilder::InsertionGuard g(b); Block *block = b.createBlock( @@ -144,15 +144,15 @@ LogicalResult RewriteToLargeSizeJit(FuncOp op) { for (auto &op : old_body->without_terminator()) { b.clone(op, bvm); } - b.create( - loc, block->back().getResults().front()); + tf_framework::JITCompileYieldOp::create( + b, loc, block->back().getResults().front()); } // Create JIT execute op. - auto jit_execute_op = b.create( - loc, op.getResultTypes().front(), /*ctx=*/Value(), + auto jit_execute_op = tf_framework::JITExecuteOp::create( + b, loc, op.getResultTypes().front(), /*ctx=*/Value(), jit_compile_op.getResult(), new_body->getArguments()); - b.create(loc, jit_execute_op.getResult()); + scf::YieldOp::create(b, loc, jit_execute_op.getResult()); }; auto aot_body_builder_fn = [&](OpBuilder &b, Location loc) { IRMapping bvm; @@ -161,13 +161,13 @@ LogicalResult RewriteToLargeSizeJit(FuncOp op) { for (auto &op : old_body->without_terminator()) { last_clone = b.clone(op, bvm); } - b.create(loc, last_clone->getResults().front()); + scf::YieldOp::create(b, loc, last_clone->getResults().front()); }; // Create the conditional and return operation. - auto ifOp = rewriter.create( - loc, large_tensor_predicate, jit_body_builder_fn, aot_body_builder_fn); - rewriter.create(loc, ifOp.getResults().front()); + auto ifOp = scf::IfOp::create(rewriter, loc, large_tensor_predicate, + jit_body_builder_fn, aot_body_builder_fn); + func::ReturnOp::create(rewriter, loc, ifOp.getResults().front()); // Remove the old body. rewriter.eraseBlock(old_body); @@ -186,19 +186,19 @@ void PackJITCompileOp(tf_framework::JITCompileOp op, // Temporarily, build the module that would be JIT-compiled. This is only to // obtain the serialized code attribute. auto loc = op->getLoc(); - auto jit_module = rewriter.create(loc); + auto jit_module = ModuleOp::create(rewriter, loc); { OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPointToStart(jit_module.SingleBlock::getBody()); - auto jit_function = rewriter.create( - loc, tf_framework::JITCompileFromStrOp::kJITEntryFunctionName, + auto jit_function = func::FuncOp::create( + rewriter, loc, tf_framework::JITCompileFromStrOp::kJITEntryFunctionName, rewriter.getFunctionType(body->getArgumentTypes(), yield_op->getOperandTypes())); jit_function->setAttr(tf_framework::TFFrameworkDialect::kTFEntryAttrName, rewriter.getUnitAttr()); jit_function.getBody().takeBody(op.getBodyRegion()); rewriter.setInsertionPointToEnd(&jit_function.getBody().front()); - rewriter.create(loc, yield_op.getResult()); + func::ReturnOp::create(rewriter, loc, yield_op.getResult()); rewriter.eraseOp(yield_op); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc index 4b1d10ca8dd372..66a455ca71c745 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc @@ -73,8 +73,8 @@ struct ShapeReificationPattern : public OpRewritePattern { // Insert cast if needed. 
if (reifiedShape.getType() != op.getType()) { - reifiedShape = rewriter.create(op.getLoc(), op.getType(), - reifiedShape); + reifiedShape = tensor::CastOp::create(rewriter, op.getLoc(), op.getType(), + reifiedShape); } rewriter.replaceOp(op, reifiedShape); @@ -148,9 +148,9 @@ LogicalResult moveUpIntoAssumingOpMatchAndRewrite(Operation *op, // Insert the rewritten assuming op right before the old one. OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(assumingOp); - auto newAssumingOp = rewriter.create( - assumingOp.getLoc(), assumingOp.getWitness(), - [&](OpBuilder &b, Location) { + auto newAssumingOp = shape::AssumingOp::create( + rewriter, assumingOp.getLoc(), assumingOp.getWitness(), + [&](OpBuilder& b, Location) { // Copy body. IRMapping mapping; for (auto &nested : body->without_terminator()) @@ -304,9 +304,9 @@ struct MoveUpOutOfAssumingOpPattern : public OpRewritePattern { // explicitly as they are assumed to be independent. The assuming op is // rewritten accordingly. SmallVector replacementValues; - auto newAssumingOp = rewriter.create( - assumingOp.getLoc(), assumingOp.getWitness(), - [&](OpBuilder &b, Location) { + auto newAssumingOp = shape::AssumingOp::create( + rewriter, assumingOp.getLoc(), assumingOp.getWitness(), + [&](OpBuilder& b, Location) { // Copy body. IRMapping mapping; for (Operation &nested : body->without_terminator()) { @@ -354,15 +354,16 @@ struct MergeAssumingOpsPattern : public OpRewritePattern { // Merge witnesses. OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(precedingOp); - Value newWitness = rewriter.create( - op.getWitness().getDefiningOp()->getLoc(), + Value newWitness = shape::AssumingAllOp::create( + rewriter, op.getWitness().getDefiningOp()->getLoc(), ValueRange{precedingOp.getWitness(), op.getWitness()}); // Merge assuming ops. Block *body_a = precedingOp.getBody(); Block *body_b = op.getBody(); - auto newAssumingOp = rewriter.create( - precedingOp.getLoc(), newWitness, [&](OpBuilder &b, Location) { + auto newAssumingOp = shape::AssumingOp::create( + rewriter, precedingOp.getLoc(), newWitness, + [&](OpBuilder& b, Location) { // Copy preceding op's body. 
IRMapping mapping; for (auto &nested : body_a->without_terminator()) { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc index ceda47565bf999..959c56a87982ec 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc @@ -49,14 +49,14 @@ class TFAssertOpConverter : public OpConversionPattern { auto func = op->getParentOfType(); Block *error_reporting_block = rewriter.createBlock(&func.getRegion(), {}, {}); - rewriter.create(loc, adaptor.getCtx(), - adaptor.getErrorCode(), adaptor.getMsg()); + ReportErrorOp::create(rewriter, loc, adaptor.getCtx(), + adaptor.getErrorCode(), adaptor.getMsg()); SmallVector null_memrefs; for (auto type : func.getFunctionType().getResults()) { - null_memrefs.push_back(rewriter.create(loc, type)); + null_memrefs.push_back(NullMemRefOp::create(rewriter, loc, type)); } - rewriter.create(loc, null_memrefs); + func::ReturnOp::create(rewriter, loc, null_memrefs); rewriter.restoreInsertionPoint(ip); rewriter.replaceOpWithNewOp( diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc index a7d26813239571..2fd419972f4289 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc @@ -116,8 +116,8 @@ struct PropagateTfAbiKnowledgeToKernelsPass Value offset = kernel.getArgument(kernel_p + 2); Value &zero = constants[0]; if (!zero) { - zero = b.create(loc, offset.getType(), - b.getIndexAttr(0)); + zero = LLVM::ConstantOp::create(b, loc, offset.getType(), + b.getIndexAttr(0)); } offset.replaceAllUsesWith(zero); } @@ -128,9 +128,9 @@ struct PropagateTfAbiKnowledgeToKernelsPass kernel.getArgument(kernel_p + 2 + memref.getRank() * 2); Value &stride_val = constants[const_stride->second]; if (!stride_val) { - stride_val = b.create( - loc, inner_stride.getType(), - b.getIndexAttr(const_stride->second)); + stride_val = + LLVM::ConstantOp::create(b, loc, inner_stride.getType(), + b.getIndexAttr(const_stride->second)); } inner_stride.replaceAllUsesWith(stride_val); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc index 4cbe21b73f62c3..21d477b30547c1 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc @@ -70,25 +70,27 @@ class ConvertToLLVMCallOpPattern : public ConvertOpToLLVMPattern { // If the attribute is missing or empty, set the element count to 0 and // return NULL. if (!attr.has_value() || attr.value().empty()) { - Value zero = rewriter->create( - loc, size_ty, rewriter->getIntegerAttr(size_ty, 0)); - Value null_ptr = rewriter->create(loc, ptr_ty); + Value zero = LLVM::ConstantOp::create( + *rewriter, loc, size_ty, rewriter->getIntegerAttr(size_ty, 0)); + Value null_ptr = LLVM::ZeroOp::create(*rewriter, loc, ptr_ty); return std::make_pair(zero, null_ptr); } // Allocate array to store the elements. 
auto &array_attr = attr.value(); - Value array_size = rewriter->create( - loc, size_ty, rewriter->getIntegerAttr(size_ty, array_attr.size())); - Value array_ptr = rewriter->create( - loc, ptr_ty, element_ty, array_size, /*alignment=*/0); + Value array_size = LLVM::ConstantOp::create( + *rewriter, loc, size_ty, + rewriter->getIntegerAttr(size_ty, array_attr.size())); + Value array_ptr = LLVM::AllocaOp::create(*rewriter, loc, ptr_ty, element_ty, + array_size, /*alignment=*/0); for (const auto &e : llvm::enumerate(array_attr)) { - Value index = rewriter->create( - loc, size_ty, rewriter->getIntegerAttr(size_ty, e.index())); - Value element_ptr = rewriter->create(loc, ptr_ty, element_ty, - array_ptr, index); + Value index = LLVM::ConstantOp::create( + *rewriter, loc, size_ty, + rewriter->getIntegerAttr(size_ty, e.index())); + Value element_ptr = LLVM::GEPOp::create(*rewriter, loc, ptr_ty, + element_ty, array_ptr, index); Value element = create_element(e.value()); - rewriter->create(loc, element, element_ptr); + LLVM::StoreOp::create(*rewriter, loc, element, element_ptr); } return std::make_pair(array_size, array_ptr); } @@ -101,8 +103,8 @@ class ConvertToLLVMCallOpPattern : public ConvertOpToLLVMPattern { assert(mlir::isa(element_ty) && "expect integer element type"); return ConvertArrayAttrToStackAllocatedArray( loc, size_ty, element_ty, attr, rewriter, [&](Attribute attr) { - return rewriter->create( - loc, element_ty, + return LLVM::ConstantOp::create( + *rewriter, loc, element_ty, rewriter->getIntegerAttr(element_ty, mlir::cast(attr).getInt())); }); @@ -136,8 +138,8 @@ class TFAllocOpConverter : public ConvertToLLVMCallOpPattern { // Convert `output_index` or set it to -1 if the attribute is missing. Type llvmInt32Type = IntegerType::get(rewriter.getContext(), 32); - Value output_index = rewriter.create( - loc, llvmInt32Type, + Value output_index = LLVM::ConstantOp::create( + rewriter, loc, llvmInt32Type, rewriter.getI32IntegerAttr(tf_alloc_op.getOutputIndex().has_value() ? 
tf_alloc_op.getOutputIndex().value() : -1)); @@ -152,12 +154,11 @@ class TFAllocOpConverter : public ConvertToLLVMCallOpPattern { FlatSymbolRefAttr tf_func_ref = GetOrInsertLLVMFunction(GetFuncName(), GetFuncType(), op, &rewriter); Value allocated_byte_ptr = - rewriter - .create( - loc, getVoidPtrType(), tf_func_ref, - llvm::ArrayRef({adaptor.getCtx(), num_elements, element_size, - output_index, candidates_count_and_ptr.first, - candidates_count_and_ptr.second})) + LLVM::CallOp::create( + rewriter, loc, getVoidPtrType(), tf_func_ref, + llvm::ArrayRef({adaptor.getCtx(), num_elements, element_size, + output_index, candidates_count_and_ptr.first, + candidates_count_and_ptr.second})) .getResult(); MemRefDescriptor memRefDescriptor = CreateMemRefDescriptor( @@ -213,7 +214,7 @@ class TFAllocOpConverter : public ConvertToLLVMCallOpPattern { // Update stride if (pos > 0) { stride_carried = - rewriter.create(loc, stride_carried, size); + LLVM::MulOp::create(rewriter, loc, stride_carried, size); } } return memref_desc; @@ -272,12 +273,12 @@ class JITCompileFromStrOpConverter ConvertIntegerArrayAttrToStackAllocatedArray( loc, rewriter.getI64Type(), rewriter.getI64Type(), op.getUnrollFactors(), &rewriter); - Value enable_ftz = rewriter.create( - loc, rewriter.getI1Type(), op.getEnableFtzAttr()); - Value index_64bit = rewriter.create( - loc, rewriter.getI1Type(), op.getIndex64BitAttr()); - Value cpu_codegen = rewriter.create( - loc, rewriter.getI1Type(), op.getCpuCodegenAttr()); + Value enable_ftz = LLVM::ConstantOp::create( + rewriter, loc, rewriter.getI1Type(), op.getEnableFtzAttr()); + Value index_64bit = LLVM::ConstantOp::create( + rewriter, loc, rewriter.getI1Type(), op.getIndex64BitAttr()); + Value cpu_codegen = LLVM::ConstantOp::create( + rewriter, loc, rewriter.getI1Type(), op.getCpuCodegenAttr()); FlatSymbolRefAttr tf_func_ref = GetOrInsertLLVMFunction(GetFuncName(), GetFuncType(), op, &rewriter); rewriter.replaceOpWithNewOp( @@ -327,40 +328,39 @@ class JITExecuteOpConverter : public ConvertToLLVMCallOpPattern { getTypeConverter()->convertType(op->getResultTypes().front()); Type ptr_ty = LLVM::LLVMPointerType::get(getContext()); Type i64_ty = rewriter.getI64Type(); - Value one = rewriter.create( - loc, i64_ty, rewriter.getI64IntegerAttr(1)); + Value one = LLVM::ConstantOp::create(rewriter, loc, i64_ty, + rewriter.getI64IntegerAttr(1)); auto result_ptr = - rewriter.create(loc, ptr_ty, result_ty, one); + LLVM::AllocaOp::create(rewriter, loc, ptr_ty, result_ty, one); // Pass the buffer arguments as a stack-allocated array. 
Type args_elem_ty = adaptor.getInputs().front().getType(); - Value num_args = rewriter.create( - loc, i64_ty, + Value num_args = LLVM::ConstantOp::create( + rewriter, loc, i64_ty, rewriter.getI64IntegerAttr( static_cast(adaptor.getInputs().size()))); Value args_ptr = - rewriter.create(loc, ptr_ty, args_elem_ty, num_args, - /*alignment=*/0); + LLVM::AllocaOp::create(rewriter, loc, ptr_ty, args_elem_ty, num_args, + /*alignment=*/0); for (const auto &it : llvm::enumerate(adaptor.getInputs())) { - Value index = rewriter.create( - loc, i64_ty, rewriter.getI64IntegerAttr(it.index())); - Value element_ptr = rewriter.create( - loc, ptr_ty, args_elem_ty, args_ptr, index); - rewriter.create(loc, it.value(), element_ptr); + Value index = LLVM::ConstantOp::create( + rewriter, loc, i64_ty, rewriter.getI64IntegerAttr(it.index())); + Value element_ptr = LLVM::GEPOp::create(rewriter, loc, ptr_ty, + args_elem_ty, args_ptr, index); + LLVM::StoreOp::create(rewriter, loc, it.value(), element_ptr); } // Materialize runtime call. FlatSymbolRefAttr tf_func_ref = GetOrInsertLLVMFunction(GetFuncName(), GetFuncType(), op, &rewriter); - rewriter.create( - loc, mlir::TypeRange(), tf_func_ref, - ValueRange{adaptor.getCtx(), adaptor.getCallable(), result_ptr, - num_args, args_ptr}); + LLVM::CallOp::create(rewriter, loc, mlir::TypeRange(), tf_func_ref, + ValueRange{adaptor.getCtx(), adaptor.getCallable(), + result_ptr, num_args, args_ptr}); // Copy result (including the descriptor) to a stack-allocated buffer and // free the old descriptor. llvm::SmallVector final_result = { - rewriter.create(loc, result_ty, result_ptr)}; + LLVM::LoadOp::create(rewriter, loc, result_ty, result_ptr)}; if (failed(copyUnrankedDescriptors(rewriter, loc, op->getResultTypes(), final_result, /*toDynamic=*/false))) { @@ -402,8 +402,8 @@ class ReportErrorOpConverter // Insert function call. FlatSymbolRefAttr tf_func_ref = GetOrInsertLLVMFunction(GetFuncName(), GetFuncType(), op, &rewriter); - Value error_code = rewriter.create( - loc, typeConverter->convertType(rewriter.getI32Type()), + Value error_code = LLVM::ConstantOp::create( + rewriter, loc, typeConverter->convertType(rewriter.getI32Type()), adaptor.getErrorCodeAttr()); rewriter.replaceOpWithNewOp( op, mlir::TypeRange(), tf_func_ref, @@ -489,7 +489,7 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern { // Prepare packed args [allocatedPtr, alignedPtr, offset, sizes, strides] // to create a memref descriptor. - Value null = rewriter.create(loc, llvm_ptr_type); + Value null = LLVM::ZeroOp::create(rewriter, loc, llvm_ptr_type); SmallVector packed_values{null, null, zero}; packed_values.append(sizes); packed_values.append(strides); @@ -518,11 +518,12 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern { // setting its pointer to NULL. Value alloca_size = UnrankedMemRefDescriptor::computeSize( rewriter, loc, *getTypeConverter(), desc, addressSpace); - Value underlying_desc_ptr = rewriter.create( - loc, getVoidPtrType(), IntegerType::get(getContext(), 8), alloca_size); + Value underlying_desc_ptr = + LLVM::AllocaOp::create(rewriter, loc, getVoidPtrType(), + IntegerType::get(getContext(), 8), alloca_size); // Populate underlying ranked descriptor. 
- Value null = rewriter.create(loc, llvm_ptr_type); + Value null = LLVM::ZeroOp::create(rewriter, loc, llvm_ptr_type); UnrankedMemRefDescriptor::setAllocatedPtr( rewriter, loc, underlying_desc_ptr, llvm_ptr_type, null); UnrankedMemRefDescriptor::setAlignedPtr(rewriter, loc, *getTypeConverter(), @@ -551,21 +552,23 @@ class IsValidMemRefOpConverter // Compare every size in the descriptor to 0 to check num_elements == 0. int64_t rank = mlir::cast(op.getArg().getType()).getRank(); - Value is_empty_shape = rewriter.create( - loc, rewriter.getI1Type(), rewriter.getBoolAttr(false)); + Value is_empty_shape = LLVM::ConstantOp::create( + rewriter, loc, rewriter.getI1Type(), rewriter.getBoolAttr(false)); Value zero = createIndexAttrConstant(rewriter, loc, getIndexType(), 0); for (int i = 0; i < rank; ++i) { Value size = desc.size(rewriter, loc, i); - Value is_zero_size = rewriter.create( - loc, rewriter.getI1Type(), LLVM::ICmpPredicate::eq, size, zero); + Value is_zero_size = + LLVM::ICmpOp::create(rewriter, loc, rewriter.getI1Type(), + LLVM::ICmpPredicate::eq, size, zero); is_empty_shape = - rewriter.create(loc, is_empty_shape, is_zero_size); + LLVM::OrOp::create(rewriter, loc, is_empty_shape, is_zero_size); } Value ptr = desc.allocatedPtr(rewriter, loc); - Value null = rewriter.create(loc, getVoidPtrType()); - Value is_not_nullptr = rewriter.create( - loc, rewriter.getI1Type(), LLVM::ICmpPredicate::ne, ptr, null); + Value null = LLVM::ZeroOp::create(rewriter, loc, getVoidPtrType()); + Value is_not_nullptr = + LLVM::ICmpOp::create(rewriter, loc, rewriter.getI1Type(), + LLVM::ICmpPredicate::ne, ptr, null); // Valid memref = ptr != NULL || num_elements == 0; rewriter.replaceOpWithNewOp(op, is_not_nullptr, is_empty_shape); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc index ff19510805fe50..e51a397363e01e 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc @@ -115,27 +115,28 @@ Value ConvertLaunchFuncOpToTfRuntimeCallPattern::generateParamsArray( for (auto argument : arguments) argument_types.push_back(argument.getType()); auto struct_type = LLVM::LLVMStructType::getNewIdentified( context_, StringRef(), argument_types); - auto one = builder.create(loc, llvm_int32_type_, - builder.getI32IntegerAttr(1)); - auto struct_ptr = builder.create( - loc, llvm_pointer_type_, struct_type, one, /*alignment=*/0); - auto array_size = builder.create( - loc, llvm_int32_type_, builder.getI32IntegerAttr(num_arguments)); - auto array_ptr = builder.create( - loc, llvm_pointer_type_, llvm_pointer_type_, array_size, /*alignment=*/0); - auto zero = builder.create(loc, llvm_int32_type_, - builder.getI32IntegerAttr(0)); + auto one = LLVM::ConstantOp::create(builder, loc, llvm_int32_type_, + builder.getI32IntegerAttr(1)); + auto struct_ptr = LLVM::AllocaOp::create(builder, loc, llvm_pointer_type_, + struct_type, one, /*alignment=*/0); + auto array_size = LLVM::ConstantOp::create( + builder, loc, llvm_int32_type_, builder.getI32IntegerAttr(num_arguments)); + auto array_ptr = + LLVM::AllocaOp::create(builder, loc, llvm_pointer_type_, + llvm_pointer_type_, array_size, /*alignment=*/0); + auto zero = LLVM::ConstantOp::create(builder, loc, llvm_int32_type_, + builder.getI32IntegerAttr(0)); for (auto en : llvm::enumerate(arguments)) { - auto index = builder.create( - loc, 
llvm_int32_type_, builder.getI32IntegerAttr(en.index())); - auto field_ptr = builder.create( - loc, llvm_pointer_type_, struct_type, struct_ptr, + auto index = LLVM::ConstantOp::create( + builder, loc, llvm_int32_type_, builder.getI32IntegerAttr(en.index())); + auto field_ptr = LLVM::GEPOp::create( + builder, loc, llvm_pointer_type_, struct_type, struct_ptr, ArrayRef{zero, index.getResult()}); - builder.create(loc, en.value(), field_ptr); + LLVM::StoreOp::create(builder, loc, en.value(), field_ptr); auto element_ptr = - builder.create(loc, llvm_pointer_type_, llvm_pointer_type_, - array_ptr, index.getResult()); - builder.create(loc, field_ptr, element_ptr); + LLVM::GEPOp::create(builder, loc, llvm_pointer_type_, + llvm_pointer_type_, array_ptr, index.getResult()); + LLVM::StoreOp::create(builder, loc, field_ptr, element_ptr); } return array_ptr; } @@ -220,11 +221,11 @@ LogicalResult ConvertLaunchFuncOpToTfRuntimeCallPattern::matchAndRewrite( }); rewriter.setInsertionPointToStart( launch_op->getParentOfType().getBody()); - function = rewriter.create( - loc, kTfWrapperLibaryLaunchHelperName, function_type); + function = LLVM::LLVMFuncOp::create( + rewriter, loc, kTfWrapperLibaryLaunchHelperName, function_type); } - rewriter.create( - loc, TypeRange(), mlir::SymbolRefAttr::get(function), + LLVM::CallOp::create( + rewriter, loc, TypeRange(), mlir::SymbolRefAttr::get(function), ArrayRef{context_arg, module_blob, kernel_name_global, adaptor.getGridSizeX(), adaptor.getGridSizeY(), diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc index b3cb73b78baf20..a6ee71bfed73b8 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc @@ -35,7 +35,7 @@ FlatSymbolRefAttr GetOrInsertLLVMFunction(StringRef func_name, Type func_type, if (!tf_func) { OpBuilder::InsertionGuard guard(*b); b->setInsertionPointToStart(module.getBody()); - tf_func = b->create(b->getUnknownLoc(), func_name, func_type); + tf_func = LLVMFuncOp::create(*b, b->getUnknownLoc(), func_name, func_type); } return SymbolRefAttr::get(b->getContext(), func_name); } @@ -55,11 +55,12 @@ Value CreateOrFindGlobalStringConstant(Location loc, StringRef global_name, StringRef symbol_name = global_op.getName(); Type symbol_type = global_op.getType(); Type ptr_type = LLVM::LLVMPointerType::get(b->getContext()); - Value global_ptr = b->create(loc, ptr_type, symbol_name); + Value global_ptr = + LLVM::AddressOfOp::create(*b, loc, ptr_type, symbol_name); Value c0 = - b->create(loc, b->getI64Type(), b->getIndexAttr(0)); - return b->create(loc, ptr_type, symbol_type, global_ptr, - ValueRange{c0, c0}); + LLVM::ConstantOp::create(*b, loc, b->getI64Type(), b->getIndexAttr(0)); + return LLVM::GEPOp::create(*b, loc, ptr_type, symbol_type, global_ptr, + ValueRange{c0, c0}); } return LLVM::createGlobalString(loc, *b, global_name, content, LLVM::Linkage::Internal); diff --git a/tensorflow/compiler/mlir/tools/optimize/BUILD b/tensorflow/compiler/mlir/tools/optimize/BUILD index 6a3cc301bc24ca..d7bece21567fdf 100644 --- a/tensorflow/compiler/mlir/tools/optimize/BUILD +++ b/tensorflow/compiler/mlir/tools/optimize/BUILD @@ -17,9 +17,7 @@ cc_library( deps = [ "//tensorflow/compiler/mlir/tools:safe_cast", "//tensorflow/core/framework:tensor_shape", - "//tensorflow/core/platform:logging", - "//tensorflow/core/platform:macros", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/types:span", - 
"@local_xla//xla/tsl/platform:status", ], ) diff --git a/tensorflow/compiler/mlir/tools/optimize/quantization_utils.cc b/tensorflow/compiler/mlir/tools/optimize/quantization_utils.cc index 09a5b928a2622f..5e0ec1ccac3b9f 100644 --- a/tensorflow/compiler/mlir/tools/optimize/quantization_utils.cc +++ b/tensorflow/compiler/mlir/tools/optimize/quantization_utils.cc @@ -21,9 +21,9 @@ limitations under the License. #include #include +#include "absl/log/check.h" #include "absl/types/span.h" #include "tensorflow/compiler/mlir/tools/safe_cast.h" -#include "xla/tsl/platform/status.h" #include "tensorflow/core/framework/tensor_shape.h" namespace tflite_migration { @@ -92,13 +92,13 @@ void SymmetricPerChannelQuantizeValues(const float* const input, // Quantize the values. int indices[kPerChannelMaxDim]; tensorflow::TensorShape unextended_shape; - TF_CHECK_OK(tensorflow::TensorShapeUtils::MakeShape(absl::MakeSpan(dimension), - &unextended_shape)); + CHECK_OK(tensorflow::TensorShapeUtils::MakeShape(absl::MakeSpan(dimension), + &unextended_shape)); tensorflow::TensorShape shape; for (int i = 0; i < kPerChannelMaxDim - unextended_shape.dims(); ++i) { - TF_CHECK_OK(shape.AddDimWithStatus(1)); + CHECK_OK(shape.AddDimWithStatus(1)); } - TF_CHECK_OK(shape.AppendShapeWithStatus(unextended_shape)); + CHECK_OK(shape.AppendShapeWithStatus(unextended_shape)); channel_dim_index += kPerChannelMaxDim - unextended_shape.dims(); for (indices[0] = 0; indices[0] < shape.dim_size(0); indices[0]++) { diff --git a/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir b/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir index 02e9c0649e3f78..cd9a2dcdf746fd 100644 --- a/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir @@ -1,4 +1,4 @@ -// RUN: tf-tosa-opt --tosa-convert-tfl-uint8 --verify-each %s | FileCheck %s +// RUN: tf-tosa-opt --tosa-convert-tfl-uint8 --verify-diagnostics --verify-each %s | FileCheck %s // Operations for testing --tosa-convert-tfl-uint8 @@ -28,3 +28,20 @@ func.func @test_cast_ui8(%arg0: tensor<1x256x256x3x!quant.uniform>) -> tensor<1x256x256x3xf32> func.return %0 : tensor<1x256x256x3xf32> } + +// ---- + +// CHECK-LABEL: test_error_tosa_ops +func.func @test_error_tosa_ops(%arg0: tensor<5x10xi8>) -> (tensor<5x10xi8>, none) { + + // Dummy use to TFL dialect to load TFL dialect in MLIR context + %0 = "tfl.no_value"() <{value}> : () -> none + + // expected-error @+1 {{tosa operations are not expected in this pass. Run tosa-convert-tfl-uint8 before tosa-legalize-tfl}} + %cst1 = "tosa.const"() <{values = dense<1> : tensor<5x10xi8>}> : () -> tensor<5x10xi8> + // expected-error @+1 {{tosa operations are not expected in this pass. 
Run tosa-convert-tfl-uint8 before tosa-legalize-tfl}} + %1 = "tosa.add"(%arg0, %cst1) : (tensor<5x10xi8>, tensor<5x10xi8>) -> tensor<5x10xi8> + + + func.return %1, %0 : tensor<5x10xi8>, none +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir index 0b6dd410c57d9a..78e616d8967bb6 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir @@ -2930,6 +2930,21 @@ func.func @test_relu_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor> { +// CHECK: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xui16>}> : () -> tensor<1xui16> +// CHECK: %[[RESCALE_0:.*]] = tosa.rescale %[[ARG0]], %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_2]] {input_unsigned = true, output_unsigned = true, per_channel = false, rounding_mode = SINGLE_ROUND, scale32 = true} : (tensor>, tensor<1xi32>, tensor<1xi8>, tensor<1xui16>, tensor<1xui16>) -> tensor> +// CHECK: %[[CLAMP_0:.*]] = tosa.clamp %[[RESCALE_0]] {max_val = 65535 : ui16, min_val = 0 : ui16} : (tensor>) -> tensor> +// CHECK: return %[[CLAMP_0]] +func.func @test_relu_qu16(%arg0:tensor>) -> (tensor>) { + %0 = "tfl.relu"(%arg0) : (tensor>) -> tensor> + return %0 : tensor> +} + +// ----- + // CHECK-LABEL: test_relu0To1_qi8 // CHECK-DAG: %[[VAL_0:.*]]: tensor<13x21x3x!quant.uniform> // CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<2147449478> : tensor<1xi32>}> @@ -3267,6 +3282,21 @@ func.func @test_fullyconnected_dynamic_output(%arg0: tensor<1x2048xf32>, %arg1: // ----- +// CHECK-LABEL: @test_fullyconnected_dynamic_batch +func.func @test_fullyconnected_dynamic_batch(%arg0: tensor, %arg1: tensor<256x512xf32>, %arg2: tensor<256xf32>) -> tensor { + // CHECK-DAG: %[[OUT_SHAPE:.*]] = tosa.const_shape {values = dense<[-1, 256]> : tensor<2xindex>} : () -> !tosa.shape<2> + // CHECK-DAG: %[[FILTER_SHAPE:.*]] = tosa.const_shape {values = dense<[256, 1, 1, 512]> : tensor<4xindex>} : () -> !tosa.shape<4> + // CHECK-DAG: %[[IN_SHAPE:.*]] = tosa.const_shape {values = dense<[-1, 1, 1, 512]> : tensor<4xindex>} : () -> !tosa.shape<4> + // CHECK: %[[RESHAPE_IN:.*]] = tosa.reshape %arg0, %[[IN_SHAPE]] + // CHECK: %[[RESHAPE_FILTER:.*]] = tosa.reshape %arg1, %[[FILTER_SHAPE]] + // CHECK: %[[CONV:.*]] = tosa.conv2d %[[RESHAPE_IN]], %[[RESHAPE_FILTER]], %arg2, {{.*}}, {{.*}} + // CHECK: tosa.reshape %[[CONV]], %[[OUT_SHAPE]] + %0 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor, tensor<256x512xf32>, tensor<256xf32>) -> tensor + func.return %0 : tensor +} + +// ----- + // CHECK-LABEL: @test_fullyconnected_keep_dims func.func @test_fullyconnected_keep_dims(%arg0: tensor<1x64x64x768x!quant.uniform>, %arg1: tensor<3072x768x!quant.uniform:f32, 0.003333511995151639>>, %arg2: tensor<3072x!quant.uniform>) -> tensor<1x64x64x3072x!quant.uniform> { // CHECK-DAG: %[[CONST_SHAPE0:.*]] = tosa.const_shape {values = dense<[1, 64, 64, 3072]> : tensor<4xindex>} @@ -3684,6 +3714,24 @@ func.func @test_conv2d_int8_input_variable_bias(%input: tensor<1x32x32x8x!quant. 
// ----- +// CHECK-LABEL: func.func @test_conv2d_qu16( +// CHECK-SAME: %[[ARG0:.*]]: tensor<1x32x32x8x!quant.uniform>, +// CHECK-SAME: %[[ARG1:.*]]: tensor<3x3x8x16x!quant.uniform>) -> tensor<1x32x32x3x!quant.uniform> { +// CHECK: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<14> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<16384> : tensor<1xi16>}> : () -> tensor<1xi16> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi48>}> : () -> tensor<1xi48> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xui16>}> : () -> tensor<1xui16> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_5:.*]] = tosa.conv2d %[[ARG0]], %[[ARG1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] {acc_type = i48, dilation = array, pad = array, stride = array} : (tensor<1x32x32x8x!quant.uniform>, tensor<3x3x8x16x!quant.uniform>, tensor<1xi48>, tensor<1xui16>, tensor<1xi8>) -> tensor<1x32x32x3xi48> +// CHECK: %[[RESCALE_0:.*]] = tosa.rescale %[[VAL_5]], %[[VAL_1]], %[[VAL_0]], %[[VAL_2]], %[[VAL_3]] {input_unsigned = true, output_unsigned = true, per_channel = false, rounding_mode = SINGLE_ROUND, scale32 = false} : (tensor<1x32x32x3xi48>, tensor<1xi16>, tensor<1xi8>, tensor<1xi48>, tensor<1xui16>) -> tensor<1x32x32x3x!quant.uniform> +// CHECK: return %[[RESCALE_0]] +func.func @test_conv2d_qu16(%input: tensor<1x32x32x8x!quant.uniform>, %filter: tensor<3x3x8x16x!quant.uniform>) -> tensor<1x32x32x3x!quant.uniform> { + %bias = "tfl.no_value"() {value} : () -> none + %0 = "tfl.conv_2d"(%input, %filter, %bias) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8x!quant.uniform>, tensor<3x3x8x16x!quant.uniform>, none) -> tensor<1x32x32x3x!quant.uniform> + return %0 : tensor<1x32x32x3x!quant.uniform> +} +// ----- + // CHECK-LABEL: @test_squeeze func.func @test_squeeze(%arg0: tensor<2x1x3x1xf32>) -> tensor<2x3x1xf32> { // CHECK: tosa.reshape diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir index c4d07792549543..7805fdd9742f11 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir @@ -13,6 +13,15 @@ func.func @test_add(%arg0: tensor<192x192x3xf32>, %arg1: tensor<16x192x192x3xf32 // ----- +// CHECK-LABEL: test_add_dynamic +func.func @test_add_dynamic(%arg0: tensor, %arg1: tensor<5xf32>) -> tensor { + // CHECK: tosa.add + %1 = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor, tensor<5xf32>) -> tensor + func.return %1 : tensor +} + +// ----- + // CHECK-LABEL: test_add_qi8 func.func @test_add_qi8(%arg0: tensor<13x21x1x!quant.uniform>, %arg1: tensor<1x13x21x3x!quant.uniform>) -> tensor<1x13x21x3x!quant.uniform> { // CHECK: tosa.add diff --git a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc index afd66102b8a29c..6edfc57ad8a89a 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc @@ -264,7 +264,8 @@ LogicalResult convert_graph_uint8_tensor(mlir::MLIRContext &context, // Convert intermediate tensor. 
for (auto &op : bb) { if (llvm::dyn_cast(&op)) { - continue; // Skip if the operation is a tosa::ConstOp + // Skip tosa const ops created during rescaling. + continue; } for (Value output_val : op.getResults()) { @@ -355,6 +356,13 @@ void ConvertUint8ToInt8::runOnOperation() { auto &ctx = getContext(); mlir::func::FuncOp func = getOperation(); + func.walk([&](Operation *op) { + if (isa(op)){ + // Run this before calling convert_graph_uint8_tensor as rescaling introduces tosa ops + op->emitError("tosa operations are not expected in this pass. Run tosa-convert-tfl-uint8 before tosa-legalize-tfl"); + } + }); + // Convert uint8 const tensor. const needs to be handled specifically. patterns.add(&ctx); (void)applyPatternsGreedily(func, std::move(patterns)); diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc index 803061fe56adaf..a2aaa3b905f87f 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc @@ -4774,7 +4774,7 @@ std::optional convertOneHotOp(PatternRewriter& rewriter, Operation* op, tensorflow::GetTypeFromTFTensorShape({N, W, C}, on_value_type.getElementType()), op1_reshape_on_value.getResult(), - getTosaConstShape(rewriter, op, {N, W, C})); + getTosaConstShape(rewriter, op->getLoc(), {N, W, C})); // Reshape off_value to [1, 1, 1] auto op3_reshape_off_value = CreateOpAndInfer( @@ -4789,7 +4789,7 @@ std::optional convertOneHotOp(PatternRewriter& rewriter, Operation* op, tensorflow::GetTypeFromTFTensorShape({N, K, C}, on_value_type.getElementType()), op3_reshape_off_value.getResult(), - getTosaConstShape(rewriter, op, {N, K, C})); + getTosaConstShape(rewriter, op->getLoc(), {N, K, C})); // Reshape indices to [N, W] shape_value = diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc index 9d227f75bad616..43a22266bcb0c6 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc @@ -1558,7 +1558,7 @@ LogicalResult ConvertTFTileOp::matchAndRewrite( multiples_vals.push_back( multiples_elems.getValues()[i].getInt()); - auto multiples = getTosaConstShape(rewriter, op, multiples_vals); + auto multiples = getTosaConstShape(rewriter, op->getLoc(), multiples_vals); CreateReplaceOpAndInfer(rewriter, op, output_type, tf_tile_op.getInput(), multiples); diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc index 37a9f4234d992a..b5e19e35e9d40a 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc @@ -31,25 +31,26 @@ limitations under the License. 
#include #include "llvm/ADT/ArrayRef.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project -#include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" // from @llvm-project -#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/Dialect/Tosa/Utils/QuantUtils.h" +#include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project -#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Matchers.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/Region.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/IR/ValueRange.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" @@ -359,7 +360,8 @@ LogicalResult ConvertTFLReluOp::matchAndRewrite( auto element_type = input_type.getElementType(); if (auto quant_type = dyn_cast(element_type)) { - element_type = quant_type.getStorageType(); + element_type = + tosa::getStorageElementTypeFromQuantized(quant_type); } mlir::Attribute min_val, max_val; @@ -429,7 +431,7 @@ LogicalResult ConvertTFLRelu1Op::matchAndRewrite( auto element_type = input_type.getElementType(); if (auto quant_type = dyn_cast(element_type)) { - element_type = quant_type.getStorageType(); + element_type = tosa::getStorageElementTypeFromQuantized(quant_type); } mlir::Attribute min_val, max_val; @@ -496,7 +498,7 @@ LogicalResult ConvertTFLRelu0To1Op::matchAndRewrite( auto element_type = input_type.getElementType(); if (auto quant_type = dyn_cast(element_type)) { - element_type = quant_type.getStorageType(); + element_type = tosa::getStorageElementTypeFromQuantized(quant_type); } mlir::Attribute min_val, max_val; @@ -563,7 +565,7 @@ LogicalResult ConvertTFLRelu6Op::matchAndRewrite( auto element_type = input_type.getElementType(); if (auto quant_type = dyn_cast(element_type)) { - element_type = 
quant_type.getStorageType(); + element_type = tosa::getStorageElementTypeFromQuantized(quant_type); } mlir::Attribute min_val, max_val; @@ -1405,7 +1407,8 @@ RankedTensorType getTypeForSlice(RankedTensorType type, int64_t slice_dim, per_channel_qtype.getZeroPoints().begin() + offset, per_channel_qtype.getZeroPoints().begin() + offset + slice_size); auto output_per_channel_qtype = quant::UniformQuantizedPerAxisType::get( - per_channel_qtype.getFlags(), per_channel_qtype.getStorageType(), + per_channel_qtype.getFlags(), + tosa::getStorageElementTypeFromQuantized(per_channel_qtype), per_channel_qtype.getExpressedType(), output_scale_arr, output_zp_arr, per_channel_qtype.getQuantizedDimension(), per_channel_qtype.getStorageTypeMin(), @@ -2333,7 +2336,10 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite( // shape[1]. if (input_type.getRank() != 2) { int64_t num_elems = filter_type.getShape()[1]; - int64_t num_batch = input_type.getNumElements() / num_elems; + int64_t num_batch = ShapedType::kDynamic; + if (input_type.hasStaticShape()) { + num_batch = input_type.getNumElements() / num_elems; + } SmallVector shape_vals({num_batch, num_elems}); RankedTensorType reshape_type = @@ -3006,7 +3012,7 @@ LogicalResult ConvertTFLTileOp::matchAndRewrite( multiples_vals.push_back( multiples_elems.getValues()[i].getSExtValue()); - auto multiples = getTosaConstShape(rewriter, op, multiples_vals); + auto multiples = getTosaConstShape(rewriter, op->getLoc(), multiples_vals); CreateReplaceOpAndInfer(rewriter, op, output_type, tfl_tile_op.getInput(), multiples); diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc index d1f6772ae6c5fa..dcfff41af1f1d7 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc @@ -54,11 +54,11 @@ mlir::TypeAttr getConvAccTypeAttr(PatternRewriter& rewriter, // in case of quantized types: get base element types if (auto qtype = llvm::dyn_cast(input_etype)) - input_etype = qtype.getStorageType(); + input_etype = tosa::getStorageElementTypeFromQuantized(qtype); if (auto qtype = llvm::dyn_cast(output_etype)) - output_etype = qtype.getStorageType(); + output_etype = tosa::getStorageElementTypeFromQuantized(qtype); // special cases: input_etype and output_etype are both f16 or bf16: use // acc_type=f32 @@ -355,8 +355,19 @@ Value buildRescale(PatternRewriter& rewriter, Operation* op, int32_t scale_multiplier, int32_t scale_shift, int64_t input_zp, int64_t output_zp, tosa::RoundingMode rounding_mode, bool scale32) { - bool input_unsigned = input_val.getType().isUnsignedInteger(); - bool output_unsigned = output_type.isUnsignedInteger(); + bool input_unsigned, output_unsigned; + if (auto qtype = dyn_cast( + cast(input_val.getType()).getElementType())) { + input_unsigned = !qtype.isSigned(); + } else { + input_unsigned = input_val.getType().isUnsignedInteger(); + } + if (auto qtype = + dyn_cast(output_type.getElementType())) { + output_unsigned = !qtype.isSigned(); + } else { + output_unsigned = output_type.isUnsignedInteger(); + } auto loc = op->getLoc(); Value multiplier_val = buildRescaleMultiplier(scale32, rewriter, loc, {scale_multiplier}); @@ -486,8 +497,8 @@ Value buildRescaleOpConvOutput(PatternRewriter& rewriter, Operation* op, const auto rounding_mode_attr = tosa::RoundingModeAttr::get( rewriter.getContext(), rounding_mode); - bool input_unsigned = input_qtype.isUnsignedInteger(); - bool output_unsigned = 
output_qtype.isUnsignedInteger(); + bool input_unsigned = !input_qtype.isSigned(); + bool output_unsigned = !output_qtype.isSigned(); auto loc = op->getLoc(); const Value empty_output_val = rewriter.create( @@ -664,7 +675,7 @@ Value getTosaConstHardSwish8bitTable(PatternRewriter& rewriter, Operation* op, rewriter.getF32Type(), 1.0f, 0, -128, 127); auto const_type = tensorflow::GetTypeFromTFTensorShape({256}, element_qtype); auto storage_type = tensorflow::GetTypeFromTFTensorShape( - {256}, element_qtype.getStorageType()); + {256}, getStorageElementTypeFromQuantized(element_qtype)); auto const_attr = DenseElementsAttr::get(storage_type, llvm::ArrayRef(table)); auto const_op = @@ -718,7 +729,8 @@ Value getTosaConstRsqrt8bitTable(PatternRewriter& rewriter, Operation* op, rewriter.getF32Type(), 1.0f, 0, -128, 127); auto const_type = tensorflow::GetTypeFromTFTensorShape({256}, element_qtype); auto storage_type = tensorflow::GetTypeFromTFTensorShape( - {256}, element_qtype.getStorageType()); + {256}, + tosa::getStorageElementTypeFromQuantized(element_qtype)); auto const_attr = DenseElementsAttr::get(storage_type, llvm::ArrayRef(table)); auto const_op = @@ -756,7 +768,7 @@ Value getTosaConst8bitTable(PatternRewriter& rewriter, Operation* op, rewriter.getF32Type(), 1.0f, 0, -128, 127); auto const_type = tensorflow::GetTypeFromTFTensorShape({256}, element_qtype); auto storage_type = tensorflow::GetTypeFromTFTensorShape( - {256}, element_qtype.getStorageType()); + {256}, tosa::getStorageElementTypeFromQuantized(element_qtype)); auto const_attr = DenseElementsAttr::get(storage_type, llvm::ArrayRef(table)); auto const_op = @@ -880,7 +892,7 @@ void getTosaConst32bitSoftmaxExpTable(PatternRewriter& rewriter, Operation* op, rewriter.getF32Type(), 1.0f, 0, -32768, 32767); auto const_type = tensorflow::GetTypeFromTFTensorShape({513}, element_qtype); auto storage_type = tensorflow::GetTypeFromTFTensorShape( - {513}, element_qtype.getStorageType()); + {513}, tosa::getStorageElementTypeFromQuantized(element_qtype)); auto first_const_attr = DenseElementsAttr::get(storage_type, llvm::ArrayRef(first_table)); @@ -979,15 +991,6 @@ Value getTosaConstTensorScalarInt(ImplicitLocOpBuilder& builder, Type type, return const_op.getResult(); } -Value getTosaConstShape(PatternRewriter& rewriter, Operation* op, - llvm::ArrayRef values) { - auto attr = rewriter.getIndexTensorAttr(values); - auto type = - tosa::shapeType::get(rewriter.getContext(), /* rank = */ values.size()); - return CreateOpAndInfer(rewriter, op->getLoc(), type, - attr); -} - // Create a vector from a 32-bit value tensor. Returns the size of // the new vector or -1 on error. 
// Populate a int32_t vector from a val tensor @@ -1409,7 +1412,7 @@ Value reshapeScalarTo1D(PatternRewriter& rewriter, Location loc, Value value) { auto element_qtype = dyn_cast(element_type); if (element_qtype) { storage_type = tensorflow::GetTypeFromTFTensorShape( - {1}, element_qtype.getStorageType()); + {1}, tosa::getStorageElementTypeFromQuantized(element_qtype)); } DenseElementsAttr const_attr; diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h index 20908312f40718..b22db1b0963278 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h @@ -144,11 +144,6 @@ Value getTosaConstTensorSingleI32(PatternRewriter& rewriter, Operation* op, Value getTosaConstTensorScalarInt(ImplicitLocOpBuilder& builder, Type type, int64_t val, int rank); -// Create a tosa::ConstShape based on the specified values -Value getTosaConstShape(PatternRewriter& rewriter, Operation* op, - llvm::ArrayRef values); - - // Populate a int32_t vector from a val tensor // return failure if val is not a constant value // return success otherwise diff --git a/tensorflow/compiler/mlir/tosa/transforms/passes.h b/tensorflow/compiler/mlir/tosa/transforms/passes.h index 0475d46a37a091..bd170f61cb2fb8 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/passes.h +++ b/tensorflow/compiler/mlir/tosa/transforms/passes.h @@ -70,7 +70,6 @@ std::unique_ptr> createVerifyFullyConvertedPass(); std::unique_ptr> createLegalizeTFLStatefulPass(); #define GEN_PASS_REGISTRATION -#define GEN_PASS_CLASSES #define GEN_PASS_DECL_TOSALEGALIZETFPASS #define GEN_PASS_DECL_TOSALEGALIZETFLPASS #define GEN_PASS_DECL_TOSALEGALIZETFTFLPASS diff --git a/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc b/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc index e4a6ca5a6e56a5..53d21de0195999 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc @@ -25,7 +25,8 @@ limitations under the License. 
namespace mlir::tosa { -#define GEN_PASS_DEF_STRIPM +#define GEN_PASS_DEF_STRIPFUNCTIONMETADATA +#define GEN_PASS_DEF_STRIPMODULEMETADATA #include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" namespace { @@ -45,7 +46,7 @@ static bool isTFLAttr(NamedAttribute &namedAttr) { } class StripModuleMetadataPass - : public StripModuleMetadataBase { + : public impl::StripModuleMetadataBase { public: void runOnOperation() override { auto moduleOp = getOperation(); @@ -59,7 +60,7 @@ class StripModuleMetadataPass }; class StripFunctionMetadataPass - : public StripFunctionMetadataBase { + : public impl::StripFunctionMetadataBase { public: void runOnOperation() override { auto funcOp = getOperation(); diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 00cd5c6dd87c96..3989c361047566 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -1788,7 +1788,6 @@ tf_xla_py_strict_test( srcs = ["unary_ops_test.py"], shard_count = 20, tags = [ - "cuda-only", "no_aarch64", # TODO(b/348125886) "no_cuda_asan", # times out "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -2861,7 +2860,6 @@ tf_cuda_cc_test( tags = [ "config-cuda-only", "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip - "cuda-only", # ROCmSoftwarePlatform #958 "noasan", # TODO(b/201651800) "requires-gpu-nvidia", ] + tf_cuda_tests_tags(), @@ -2882,7 +2880,6 @@ tf_cuda_cc_test( tags = [ "config-cuda-only", "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip - "cuda-only", # ROCmSoftwarePlatform #958 "noasan", # TODO(b/201651800) "requires-gpu-nvidia", ] + tf_cuda_tests_tags(), @@ -2902,13 +2899,12 @@ tf_cuda_cc_test( "//tensorflow/compiler/jit:xla_kernel_creator", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:framework", - "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", "//tensorflow/core/kernels:ops_testutil", - "@local_xla//xla/tsl/platform:status", + "@com_google_absl//absl/log:check", ], ) diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index fcd3aadbe10c9a..43d468a763f190 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -90,11 +90,9 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" @@ -4935,8 +4933,8 @@ int main(int argc, char** argv) { // XLA devices register kernels at construction time; create all known devices // to make sure the kernels are registered. 
std::vector> devices; - TF_CHECK_OK(tensorflow::DeviceFactory::AddDevices( - tensorflow::SessionOptions(), "", &devices)); + CHECK_OK(tensorflow::DeviceFactory::AddDevices(tensorflow::SessionOptions(), + "", &devices)); tensorflow::StaticDeviceMgr device_mgr(std::move(devices)); tensorflow::Device* ignored; diff --git a/tensorflow/compiler/tests/unary_ops_composition_test.cc b/tensorflow/compiler/tests/unary_ops_composition_test.cc index c27b8070bbb450..00fd0ea67041b9 100644 --- a/tensorflow/compiler/tests/unary_ops_composition_test.cc +++ b/tensorflow/compiler/tests/unary_ops_composition_test.cc @@ -16,12 +16,13 @@ limitations under the License. #include #include #include +#include #include +#include "absl/log/check.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "xla/tsl/lib/core/status_test_util.h" -#include "xla/tsl/platform/status.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/device_factory.h" @@ -33,7 +34,6 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/util/port.h" @@ -84,8 +84,8 @@ class UnaryOpsCompositionTest : public OpsTestBase { DeviceContext* device_context = device_->tensorflow_accelerator_device_info()->default_context; - TF_CHECK_OK(device_context->CopyCPUTensorToDeviceSync(&input_on_host, - device_, input)); + CHECK_OK(device_context->CopyCPUTensorToDeviceSync(&input_on_host, device_, + input)); TF_ASSERT_OK(RunOpKernel()); @@ -95,7 +95,7 @@ class UnaryOpsCompositionTest : public OpsTestBase { Tensor* output = GetOutput(0); Tensor output_on_host(cpu_allocator, output->dtype(), output->shape()); - TF_CHECK_OK(device_context->CopyDeviceTensorToCPUSync( + CHECK_OK(device_context->CopyDeviceTensorToCPUSync( output, "output 0", device_, &output_on_host)); test::ExpectClose(expected_tensor, output_on_host, /*atol=*/1e-5, diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index e5545445817ec2..9c2dfc073afccb 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -490,7 +490,6 @@ cc_library( "@local_xla//xla/service/cpu:executable_proto_cc", "@local_xla//xla/tsl/concurrency:async_value", "@local_xla//xla/tsl/platform:env", - "@local_xla//xla/tsl/platform:status", ], ) @@ -721,12 +720,15 @@ cc_library( ":common", ":xla_expression", ":xla_helpers", + ":xla_resource", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/common_runtime:core_cpu_internal", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/types:span", - "@local_xla//xla:literal", "@local_xla//xla:shape_util", "@local_xla//xla:status_macros", "@local_xla//xla:xla_data_proto_cc", @@ -767,7 +769,6 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:string_view", "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:status", "@local_xla//xla:util", "@local_xla//xla/client:client_library", ], @@ -846,18 +847,25 @@ cc_library( "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core/platform:refcount", + 
"@com_google_absl//absl/container:btree", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@local_xla//xla:executable_run_options", - "@local_xla//xla:types", - "@local_xla//xla/backends/gpu/collectives:gpu_clique_key", + "@local_xla//xla:literal", + "@local_xla//xla:literal_util", + "@local_xla//xla:shape_util", "@local_xla//xla/core/collectives:clique_id", "@local_xla//xla/core/collectives:clique_key", "@local_xla//xla/hlo/builder:xla_builder", "@local_xla//xla/hlo/builder:xla_computation", - "@local_xla//xla/hlo/builder/lib:arithmetic", - "@local_xla//xla/hlo/builder/lib:constants", "@local_xla//xla/hlo/ir:hlo", "@local_xla//xla/hlo/translate/mhlo_to_hlo:layout_util", "@local_xla//xla/service:computation_placer_hdr", @@ -1046,6 +1054,11 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@local_xla//xla:status_macros", @@ -1063,6 +1076,7 @@ tf_cc_test( "//tensorflow/cc:function_ops", "//tensorflow/cc:functional_ops", "//tensorflow/cc:ops", + "//tensorflow/cc:scope", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -1071,6 +1085,8 @@ tf_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", ], ) @@ -1253,10 +1269,9 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:status", - "@local_tsl//tsl/platform:statusor", "@local_xla//xla/stream_executor:device_memory", "@local_xla//xla/stream_executor:stream", + "@local_xla//xla/tsl/platform:statusor", ], alwayslink = 1, ) @@ -1327,21 +1342,21 @@ cc_library( "functionalize_cond.h", ], deps = [ - ":frontend_attributes_util", ":functionalize_control_flow_util", - ":tf2xla_util", + ":tf2xla_defs", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", - "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "@com_google_absl//absl/memory", + "//tensorflow/core/platform:hash", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:optional", "@local_xla//xla:status_macros", - "@local_xla//xla:union_find", ], ) @@ -1607,7 +1622,14 @@ cc_library( visibility = [":friends"], deps = [ "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:str_util", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@local_xla//xla/tsl/platform:errors", ], ) @@ -1673,12 +1695,18 @@ cc_library( 
deps = [ ":resource_operation_table", "//tensorflow/core:core_cpu", - "//tensorflow/core:graph", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime:function_body", + "//tensorflow/core/common_runtime:function_utils", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/hash", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@local_xla//xla:status_macros", ], diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc index 2adc83512c6617..b5426bc35c58df 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -16,30 +16,49 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/functionalize_cond.h" #include +#include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include #include -#include "absl/memory/memory.h" -#include "absl/strings/match.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" -#include "absl/types/optional.h" -#include "tensorflow/compiler/tf2xla/frontend_attributes_util.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" -#include "tensorflow/compiler/tf2xla/tf2xla_util.h" -#include "xla/union_find.h" -#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/compiler/tf2xla/tf2xla_defs.h" +#include "xla/status_macros.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" #include "tensorflow/core/common_runtime/shape_refiner.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_node_util.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/hash.h" #include "tensorflow/core/util/dump_graph.h" namespace tensorflow { @@ -1138,7 +1157,7 @@ StateMap::CondId FunctionalizeCond::StateAlongEdge(const Edge* e) { StateMap::CondState state; if (id != nullptr) state = *id; OutputTensor predicate; - TF_CHECK_OK(GetSwitchPredicate(*src, &predicate)); + CHECK_OK(GetSwitchPredicate(*src, &predicate)); if (e->IsControlEdge()) { // In gradients of tf.cond(), in each branch, we have a NoOp node as // control pivot. 
These NoOp nodes have control dependency from Switch diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index bb50d530484b10..281da5c23c54e8 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -1893,18 +1893,17 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:xla_resource", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/core:framework", - "//tensorflow/core:portable_gif_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:errors", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings", "@local_xla//xla:literal", "@local_xla//xla:shape_util", "@local_xla//xla:xla_data_proto_cc", "@local_xla//xla/hlo/builder:xla_builder", "@local_xla//xla/hlo/builder:xla_computation", - "@local_xla//xla/tsl/platform:status", ], ) @@ -3019,10 +3018,10 @@ tf_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/log:check", "@local_xla//xla:shape_util", "@local_xla//xla:xla_data_proto_cc", "@local_xla//xla/hlo/builder:xla_builder", - "@local_xla//xla/tsl/platform:status", ], ) diff --git a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc index 899c0063035b82..b699358106e5c4 100644 --- a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc +++ b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc @@ -553,7 +553,8 @@ absl::Status PopulateMetadataBufferIfNeeded(OpKernelContext& ctx, callback_data.outputs(i).buffer_description().shape())); void* location = static_cast(allocated->data()) + xla::ShapeUtil::ByteSizeOf(xla_shape); - se::DeviceMemoryBase m{location, num_dimensions * sizeof(int32_t)}; + stream_executor::DeviceAddressBase m{location, + num_dimensions * sizeof(int32_t)}; TF_RETURN_IF_ERROR(stream->Memcpy(&m, shape_info.data(), num_dimensions * sizeof(int32_t))); } diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc index 3bfe9e384405b2..6e8417e2d25ff2 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc @@ -16,9 +16,11 @@ limitations under the License. // XLA-specific reduction Ops. #include +#include #include #include "absl/container/inlined_vector.h" +#include "absl/log/check.h" #include "absl/log/log.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" @@ -30,14 +32,12 @@ limitations under the License. #include "xla/hlo/builder/xla_computation.h" #include "xla/literal.h" #include "xla/shape_util.h" -#include "xla/tsl/platform/status.h" #include "xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_requires.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -125,7 +125,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { // Construct the builder for the reduction lambda. 
xla::XlaBuilder r(absl::StrCat(desc, "-reduction")); xla::PrimitiveType type; - TF_CHECK_OK(DataTypeToPrimitiveType(reduction_type_, &type)); + CHECK_OK(DataTypeToPrimitiveType(reduction_type_, &type)); auto data = xla::ConvertElementType(ctx->Input(0), type); // Call virtual method to get the initial value. diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc index e06c0b09ba9938..c2aee328a9cd23 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc @@ -519,7 +519,7 @@ class XlaCallModuleOp : public XlaOpKernel { } else if (options.add_token_input_output) { // Add a dummy token if the inner computation takes a token but the // custom call doesn't have a token argument. - args.push_back(builder.create(loc)); + args.push_back(mlir::stablehlo::CreateTokenOp::create(builder, loc)); } input_args.reserve(result.input_mapping.size()); @@ -530,7 +530,7 @@ class XlaCallModuleOp : public XlaOpKernel { // Call the lowered function. auto call = - builder.create(loc, main_func, input_args); + mlir::func::CallOp::create(builder, loc, main_func, input_args); // Unpack the result tuple (`options.always_return_tuple` is true). If // `has_tuple_input_output` is true, the first result is a token type. @@ -548,7 +548,7 @@ class XlaCallModuleOp : public XlaOpKernel { mlir::Value token = results.back(); if (!token.use_empty()) { token.replaceAllUsesWith( - builder.create(loc)); + mlir::stablehlo::CreateTokenOp::create(builder, loc)); } results.pop_back(); } diff --git a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc index 99a0ec6d9e38dd..e9d0314780ca54 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc @@ -13,20 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include +#include "absl/log/check.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "xla/hlo/builder/xla_builder.h" #include "xla/shape.h" -#include "xla/tsl/platform/status.h" #include "xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_requires.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace { @@ -47,8 +47,7 @@ class XlaCustomCallOp : public XlaOpKernel { } xla::Shape output_shape; - TF_CHECK_OK( - TensorShapeToXLAShape(output_type_, output_shape_, &output_shape)); + CHECK_OK(TensorShapeToXLAShape(output_type_, output_shape_, &output_shape)); xla::XlaOp output = xla::CustomCall(ctx->builder(), target_name_, inputs, output_shape, backend_config_); ctx->SetOutput(0, output); diff --git a/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc b/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc index dd493a5606b597..93444bfeb49125 100644 --- a/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc +++ b/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc @@ -14,7 +14,9 @@ limitations under the License. 
==============================================================================*/ #include -#include +#include +#include +#include #include "absl/log/check.h" #include "absl/status/status.h" @@ -24,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/stream.h" +#include "xla/tsl/platform/statusor.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_util.h" @@ -36,8 +39,6 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/types.h" -#include "tsl/platform/status.h" -#include "tsl/platform/statusor.h" // Sample kernels for the light outside compilation test. @@ -64,14 +65,15 @@ class TestStaticTfOp : public OpKernel { // Just pass the value through. uint64_t size = input.AllocatedBytes(); - se::DeviceMemoryBase gpu_dst{out_tensor->data(), size}; + stream_executor::DeviceAddressBase gpu_dst{out_tensor->data(), size}; se::Stream* stream = ctx->op_device_context()->stream(); - OP_REQUIRES_OK(ctx, - stream->MemcpyD2D( - /*gpu_dst=*/&gpu_dst, - /*gpu_src=*/se::DeviceMemoryBase{input.data(), size}, - /*size=*/input.AllocatedBytes())); + OP_REQUIRES_OK( + ctx, + stream->MemcpyD2D( + /*gpu_dst=*/&gpu_dst, + /*gpu_src=*/stream_executor::DeviceAddressBase{input.data(), size}, + /*size=*/input.AllocatedBytes())); } }; @@ -105,21 +107,23 @@ class TestStaticMultipleOutputTfOp : public OpKernel { // Just pass the value through. uint64_t size = input.AllocatedBytes(); - se::DeviceMemoryBase gpu_dst1{out_tensor1->data(), size}; - se::DeviceMemoryBase gpu_dst2{out_tensor2->data(), size}; + stream_executor::DeviceAddressBase gpu_dst1{out_tensor1->data(), size}; + stream_executor::DeviceAddressBase gpu_dst2{out_tensor2->data(), size}; se::Stream* stream = ctx->device()->tensorflow_accelerator_device_info()->stream; - OP_REQUIRES_OK(ctx, - stream->MemcpyD2D( - /*gpu_dst=*/&gpu_dst1, - /*gpu_src=*/se::DeviceMemoryBase{input.data(), size}, - /*size=*/input.AllocatedBytes())); - OP_REQUIRES_OK(ctx, - stream->MemcpyD2D( - /*gpu_dst=*/&gpu_dst2, - /*gpu_src=*/se::DeviceMemoryBase{input.data(), size}, - /*size=*/input.AllocatedBytes())); + OP_REQUIRES_OK( + ctx, + stream->MemcpyD2D( + /*gpu_dst=*/&gpu_dst1, + /*gpu_src=*/stream_executor::DeviceAddressBase{input.data(), size}, + /*size=*/input.AllocatedBytes())); + OP_REQUIRES_OK( + ctx, + stream->MemcpyD2D( + /*gpu_dst=*/&gpu_dst2, + /*gpu_src=*/stream_executor::DeviceAddressBase{input.data(), size}, + /*size=*/input.AllocatedBytes())); } }; @@ -165,12 +169,12 @@ class TestDynamicTfOp : public OpKernel { se::Stream* stream = ctx->device()->tensorflow_accelerator_device_info()->stream; - se::DeviceMemoryBase gpu_dst{out_tensor->data(), size_to_cpy}; + stream_executor::DeviceAddressBase gpu_dst{out_tensor->data(), size_to_cpy}; OP_REQUIRES_OK(ctx, stream->MemcpyD2D( /*gpu_dst=*/&gpu_dst, /*gpu_src=*/ - se::DeviceMemoryBase{input.data(), - static_cast(size)}, + stream_executor::DeviceAddressBase{ + input.data(), static_cast(size)}, /*size=*/size_to_cpy)); } @@ -211,7 +215,7 @@ class DynamicMultidimOp : public OpKernel { void Compute(OpKernelContext* ctx) override { TensorShape output_shape; - auto vec = ctx->input(0).flat(); + auto vec = ctx->input(0).flat(); for (int i = 0; i < vec.size(); i++) { OP_REQUIRES_OK(ctx, 
output_shape.AddDimWithStatus(vec(i))); } @@ -225,8 +229,8 @@ class DynamicMultidimOp : public OpKernel { for (int i = 0; i < output_shape.num_elements(); i++) { host_data[i] = 1.0; } - se::DeviceMemoryBase gpu_dst{out_tensor->data(), - static_cast(num_elements)}; + stream_executor::DeviceAddressBase gpu_dst{ + out_tensor->data(), static_cast(num_elements)}; se::Stream* stream = ctx->device()->tensorflow_accelerator_device_info()->stream; @@ -299,13 +303,13 @@ class TestTfMustBeConstantOp : public OpKernel { AllocatorAttributes pinned_alloc_attrs; pinned_alloc_attrs.set_on_host(true); pinned_alloc_attrs.set_gpu_compatible(true); - TF_CHECK_OK(ctx->allocate_temp(input.dtype(), input.shape(), &tmp, - pinned_alloc_attrs)); + CHECK_OK(ctx->allocate_temp(input.dtype(), input.shape(), &tmp, + pinned_alloc_attrs)); - OP_REQUIRES_OK( - ctx, stream->Memcpy(tmp.data(), - se::DeviceMemoryBase{input.data(), allocated_size}, - allocated_size)); + OP_REQUIRES_OK(ctx, stream->Memcpy(tmp.data(), + stream_executor::DeviceAddressBase{ + input.data(), allocated_size}, + allocated_size)); OP_REQUIRES_OK(ctx, stream->BlockHostUntilDone()); @@ -316,8 +320,8 @@ class TestTfMustBeConstantOp : public OpKernel { Tensor* out_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output("output", ctx->input(0).shape(), &out_tensor)); - se::DeviceMemoryBase gpu_dst{out_tensor->data(), - static_cast(allocated_size)}; + stream_executor::DeviceAddressBase gpu_dst{ + out_tensor->data(), static_cast(allocated_size)}; OP_REQUIRES_OK(ctx, stream->Memcpy(&gpu_dst, tmp.data(), allocated_size)); } }; @@ -361,11 +365,12 @@ class TestDynamicTfWithBoundOp : public OpKernel { se::Stream* stream = ctx->device()->tensorflow_accelerator_device_info()->stream; - se::DeviceMemoryBase gpu_dst{out_tensor->data(), size_to_cpy}; + stream_executor::DeviceAddressBase gpu_dst{out_tensor->data(), size_to_cpy}; OP_REQUIRES_OK( ctx, stream->MemcpyD2D( /*gpu_dst=*/&gpu_dst, - /*gpu_src=*/se::DeviceMemoryBase{input.data(), size_to_cpy}, + /*gpu_src=*/ + stream_executor::DeviceAddressBase{input.data(), size_to_cpy}, /*size=*/size_to_cpy)); } diff --git a/tensorflow/compiler/tf2xla/literal_util_test.cc b/tensorflow/compiler/tf2xla/literal_util_test.cc index b7c9b5fd7bbf13..e8c7dc1a579b6b 100644 --- a/tensorflow/compiler/tf2xla/literal_util_test.cc +++ b/tensorflow/compiler/tf2xla/literal_util_test.cc @@ -52,9 +52,9 @@ TEST(LiteralUtil, LiteralToHostTensor) { template using LiteralUtilTest = ::testing::Test; using Types = - ::testing::Types, std::pair, - std::pair, std::pair, - std::pair>; + ::testing::Types, std::pair, + std::pair, std::pair, + std::pair>; TYPED_TEST_SUITE(LiteralUtilTest, Types); diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index a9b2ead7b4d839..114905925cbf20 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -127,7 +127,7 @@ absl::Status ConvertGraphDefToXlaViaMlir( // with a placeholder node that contains a single output. 
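The light-outside-compilation test kernels above all follow the same copy pattern with the renamed device-pointer wrapper. A minimal sketch of a device-to-device copy in that style, assuming a valid stream and raw device pointers covering at least `bytes` bytes:

#include <cstdint>

#include "absl/status/status.h"
#include "xla/stream_executor/device_memory.h"
#include "xla/stream_executor/stream.h"

absl::Status CopyDeviceToDevice(stream_executor::Stream* stream, void* dst_ptr,
                                void* src_ptr, uint64_t bytes) {
  // DeviceAddressBase pairs an untyped device pointer with a byte size; it is
  // the spelling these hunks use in place of se::DeviceMemoryBase.
  stream_executor::DeviceAddressBase dst{dst_ptr, bytes};
  stream_executor::DeviceAddressBase src{src_ptr, bytes};
  return stream->MemcpyD2D(&dst, src, bytes);
}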
FunctionLibraryDefinition flib_def(OpRegistry::Global(), graph_def.library()); std::unique_ptr graph(new Graph(flib_def)); - std::unordered_map feed_name_remap; + std::unordered_map feed_name_remap; TF_RETURN_IF_ERROR(AddPlaceholdersForFeeds(config, graph->op_registry(), &feed_name_remap, &graph_def)); diff --git a/tensorflow/compiler/tf2xla/resource_util.cc b/tensorflow/compiler/tf2xla/resource_util.cc index e78828df4e13a4..50990e0bb2858d 100644 --- a/tensorflow/compiler/tf2xla/resource_util.cc +++ b/tensorflow/compiler/tf2xla/resource_util.cc @@ -15,13 +15,27 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/resource_util.h" +#include #include +#include +#include #include #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "absl/container/inlined_vector.h" +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "xla/status_macros.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" +#include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/common_runtime/function_utils.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/core/errors.h" @@ -204,8 +218,8 @@ absl::Status PropagateThroughCallOp( // Instantiate associated function to get function body. FunctionLibraryRuntime::Handle handle; TF_RETURN_IF_ERROR(InstantiateFunctionCall(n.def(), lib_runtime, &handle)); - auto release_handle_on_return = gtl::MakeCleanup( - [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); }); + auto release_handle_on_return = + gtl::MakeCleanup([&] { CHECK_OK(lib_runtime->ReleaseHandle(handle)); }); const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle); // Recursively analyze called function for resource sources and users. diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc index e8b2a56cdf64d2..a28d6ac8b1554f 100644 --- a/tensorflow/compiler/tf2xla/side_effect_util.cc +++ b/tensorflow/compiler/tf2xla/side_effect_util.cc @@ -15,8 +15,21 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/side_effect_util.h" +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" #include "absl/strings/numbers.h" +#include "absl/types/span.h" +#include "xla/tsl/platform/errors.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/str_util.h" namespace tensorflow { @@ -98,9 +111,8 @@ std::set CalculateTokenInputsForOutputToken(const Graph& g) { first_side_effecting_node_on_path = n; std::string original_node_name; - TF_CHECK_OK(GetNodeAttr(n->def(), - kXlaOriginalOutsideCompilationNodeName, - &original_node_name)); + CHECK_OK(GetNodeAttr(n->def(), kXlaOriginalOutsideCompilationNodeName, + &original_node_name)); results.insert(original_node_name); }, [&](Node* n) { diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index 042b572c234355..5884cddba62b3d 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -15,19 +15,36 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla_util.h" -#include +#include +#include +#include +#include +#include +#include +#include #include #include #include +#include #include +#include +#include #include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2xla/sharding_util.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" #include "xla/xla_data.pb.h" -#include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/common_runtime/function_def_utils.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_def_util.h" @@ -35,13 +52,16 @@ limitations under the License. #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" -#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/versions.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_debug_info_builder.h" +#include "tensorflow/core/graph/graph_node_util.h" #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/errors.h" namespace tensorflow { @@ -1025,7 +1045,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g, // Look into forward While body function and check if TensorListPushBack op // has a Const input. 
NameAttrList fwd_body_attr; - TF_CHECK_OK(GetNodeAttr(fwd_while->def(), "body", &fwd_body_attr)); + CHECK_OK(GetNodeAttr(fwd_while->def(), "body", &fwd_body_attr)); const FunctionDef* fwd_body = fld->Find(fwd_body_attr.name()); if (!fwd_body) { return errors::InvalidArgument("Cannot find function ", @@ -1033,7 +1053,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g, fwd_while->DebugString()); } std::unique_ptr fwd_fbody; - TF_CHECK_OK(FunctionDefToBodyHelper( + CHECK_OK(FunctionDefToBodyHelper( *fwd_body, AttrSlice(&fwd_body_attr.attr()), fld, &fwd_fbody)); // Find the TensorListPushBack node; it's one of fwd_arg's successors. @@ -1051,7 +1071,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g, // Get input for the TensorListPushBack node. Node* input_node; - TF_CHECK_OK(tl_push_nodes[0]->input_node(1, &input_node)); + CHECK_OK(tl_push_nodes[0]->input_node(1, &input_node)); if (input_node->type_string() != "Const") { // Input for the TensorList is not Const node. continue; @@ -1062,7 +1082,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g, // Rewrite backward While body function, replace usages of // TensorListPopBack with a Const node. NameAttrList bwd_body_attr; - TF_CHECK_OK(GetNodeAttr(bwd_while->def(), "body", &bwd_body_attr)); + CHECK_OK(GetNodeAttr(bwd_while->def(), "body", &bwd_body_attr)); const FunctionDef* bwd_body = fld->Find(bwd_body_attr.name()); if (!bwd_body) { return errors::InvalidArgument("Cannot find function ", @@ -1070,7 +1090,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g, bwd_while->DebugString()); } std::unique_ptr bwd_fbody; - TF_CHECK_OK(FunctionDefToBodyHelper( + CHECK_OK(FunctionDefToBodyHelper( *bwd_body, AttrSlice(&bwd_body_attr.attr()), fld, &bwd_fbody)); // Find the TensorListPopBack node; it's one of bwd_arg's successors. diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc index ef64b82f50e5be..1d81f778232523 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc @@ -15,25 +15,39 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "tensorflow/cc/framework/ops.h" -#include "tensorflow/cc/ops/data_flow_ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/functional_ops.h" #include "tensorflow/cc/ops/list_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/cc/ops/math_ops.h" +#include "tensorflow/cc/ops/no_op.h" #include "tensorflow/compiler/tf2xla/sharding_util.h" +#include "xla/tsl/lib/core/status_test_util.h" #include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/common_runtime/graph_optimizer.h" +#include "tensorflow/core/common_runtime/function_def_utils.h" #include "tensorflow/core/common_runtime/process_function_library_runtime.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/version.h" @@ -492,8 +506,7 @@ TEST(PropagateConstIntoFunctionalNodes, RewriteTensorListWithConstMember) { const FunctionDef* bwd_body = fld.Find("bwd_body_tl_rewrite_0"); ASSERT_NE(bwd_body, nullptr); std::unique_ptr bwd_fbody; - TF_CHECK_OK( - FunctionDefToBodyHelper(*bwd_body, AttrSlice(), &fld, &bwd_fbody)); + CHECK_OK(FunctionDefToBodyHelper(*bwd_body, AttrSlice(), &fld, &bwd_fbody)); auto node_name_index = bwd_fbody->graph->BuildNodeNameIndex(); const Node* identity = node_name_index.at("identity"); ASSERT_NE(identity, nullptr); diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc index add79c369b69ef..e7925a011f9eb5 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc @@ -45,7 +45,8 @@ class XlaCompilationAllocator : public Allocator { // Regardless of the size requested, always allocates an XlaExpression. // Respects the alignment request because there is alignment checking even // for Tensors whose data is never accessed. - void* p = port::AlignedMalloc(sizeof(XlaExpression), alignment); + void* p = tsl::port::AlignedMalloc( + sizeof(XlaExpression), static_cast(alignment)); XlaExpression* expression = reinterpret_cast(p); new (expression) XlaExpression(); return expression; diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc index c2b9cc26d5d461..f4b7ed44ff41d5 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc @@ -29,7 +29,6 @@ limitations under the License. 
#include "xla/backends/cpu/runtime/function_library.h" #include "xla/service/cpu/cpu_aot_compilation_result.h" #include "xla/tsl/concurrency/async_value_ref.h" -#include "xla/tsl/platform/status.h" namespace tensorflow { @@ -47,17 +46,17 @@ XlaCompiledCpuFunctionThunks::XlaCompiledCpuFunctionThunks( std::move(function_library)); // To load a CPU executable we don't need a compiler or a stream executor. - TF_CHECK_OK(aot_compilation_result.status()); + CHECK_OK(aot_compilation_result.status()); // NO_CDC: aot_compilation_result is checked to be OK above. auto cpu_executable = std::move(*aot_compilation_result.value()) - .LoadExecutable(nullptr, nullptr); + .LoadExecutable(/*stream_exec=*/nullptr); - TF_CHECK_OK(cpu_executable.status()); + CHECK_OK(cpu_executable.status()); auto executable_or_err = // NO_CDC: cpu_executable is checked to be OK above. xla::cpu::NanoRtExecutable::Create(std::move(cpu_executable.value())); - TF_CHECK_OK(executable_or_err.status()); + CHECK_OK(executable_or_err.status()); // NO_CDC: executable_or_err is checked to be OK above. executable_ = std::move(executable_or_err.value()); } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 216125f9cb153e..b9abd5006a958a 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -226,7 +226,7 @@ class XlaCompiler { // This must be a shared_ptr, as this is passed all the way down to the // cluster compilation. This allows asynchronous compilation to hold a // reference until the compilation is finished. - std::shared_ptr device_allocator; + std::shared_ptr device_allocator; // Alias input and output buffers for parameters that are passed-through XLA // modules without being changed. diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index fad607b1ae1333..16289828892460 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -15,23 +15,29 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_context.h" +#include +#include #include +#include #include #include +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "absl/types/span.h" -#include "tensorflow/compiler/tf2xla/literal_util.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" -#include "xla/client/client_library.h" +#include "tensorflow/compiler/tf2xla/xla_resource.h" #include "xla/hlo/builder/xla_builder.h" #include "xla/hlo/builder/xla_computation.h" -#include "xla/layout_util.h" -#include "xla/literal.h" -#include "tensorflow/core/common_runtime/dma_helper.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/statusor.h" +#include "xla/shape_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { @@ -44,8 +50,8 @@ const char XlaContext::kXlaContextResourceName[] = "_xla_context"; // per-step context is looked up in the resource manager. The // JIT will prepopulate the JITContext. 
XlaContext* context; - TF_CHECK_OK(ctx->step_container()->Lookup(ctx->resource_manager(), - kXlaContextResourceName, &context)); + CHECK_OK(ctx->step_container()->Lookup(ctx->resource_manager(), + kXlaContextResourceName, &context)); // The resource manager handed us a fresh reference to 'context', but retains // a reference itself so the context won't be freed. The resource manager will // outlive the JIT compilation. @@ -88,7 +94,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) { VLOG(1) << "Building Max() for " << type_string; xla::XlaBuilder b("max<" + type_string + ">"); xla::PrimitiveType xla_type; - TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); + CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); auto x = xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); auto y = @@ -104,7 +110,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) { VLOG(1) << "Building Min() for " << type_string; xla::XlaBuilder b("min<" + type_string + ">"); xla::PrimitiveType xla_type; - TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); + CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); auto x = xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); auto y = @@ -120,7 +126,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) { VLOG(1) << "Building Add() for " << type_string; xla::XlaBuilder b("add<" + type_string + ">"); xla::PrimitiveType xla_type; - TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); + CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); auto x = xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); auto y = @@ -137,7 +143,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateLogAddExp( VLOG(1) << "Building LogAddExp() for " << type_string; xla::XlaBuilder b("log_add_exp<" + type_string + ">"); xla::PrimitiveType xla_type; - TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); + CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); auto x = xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); auto y = @@ -158,7 +164,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMul(const DataType type) { VLOG(1) << "Building Mul() for " << type_string; xla::XlaBuilder b("mul<" + type_string + ">"); xla::PrimitiveType xla_type; - TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); + CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); auto x = xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); auto y = diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc index 45814517342abc..0250d423296ede 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.cc +++ b/tensorflow/compiler/tf2xla/xla_helpers.cc @@ -17,58 +17,74 @@ limitations under the License. 
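The xla_context.cc hunks above touch the GetOrCreate* helpers that build one scalar reduction computation per data type. A sketch of that construction for Max, without the caching, assuming DataTypeToPrimitiveType and the XLA builder APIs shown in the surrounding context:

#include <utility>

#include "absl/log/check.h"
#include "tensorflow/compiler/tf2xla/type_util.h"
#include "tensorflow/core/framework/types.pb.h"
#include "xla/hlo/builder/xla_builder.h"
#include "xla/hlo/builder/xla_computation.h"
#include "xla/shape_util.h"

// Builds a two-parameter scalar max() computation of the given element type,
// mirroring the shape of XlaContext::GetOrCreateMax above.
xla::XlaComputation BuildScalarMax(tensorflow::DataType type) {
  xla::XlaBuilder b("max");
  xla::PrimitiveType xla_type;
  CHECK_OK(tensorflow::DataTypeToPrimitiveType(type, &xla_type));
  auto x = xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
  auto y = xla::Parameter(&b, 1, xla::ShapeUtil::MakeShape(xla_type, {}), "y");
  xla::Max(x, y);
  auto computation = b.Build();
  CHECK_OK(computation.status());
  return std::move(computation).value();
}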
#include "tensorflow/compiler/tf2xla/xla_helpers.h" -#include +#include +#include #include #include +#include +#include "absl/container/btree_map.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" #include "absl/synchronization/notification.h" +#include "absl/time/time.h" #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/lib/util.h" -#include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "xla/backends/gpu/collectives/gpu_clique_key.h" #include "xla/core/collectives/clique_id.h" #include "xla/core/collectives/clique_key.h" -#include "xla/hlo/builder/lib/arithmetic.h" -#include "xla/hlo/builder/lib/constants.h" +#include "xla/executable_run_options.h" #include "xla/hlo/builder/xla_builder.h" #include "xla/hlo/builder/xla_computation.h" +#include "xla/literal.h" +#include "xla/literal_util.h" +#include "xla/service/computation_placer.h" #include "xla/service/gpu/gpu_executable_run_options.h" +#include "xla/shape.h" +#include "xla/shape_util.h" #include "xla/stream_executor/stream.h" #include "xla/stream_executor/stream_executor.h" -#include "xla/types.h" +#include "xla/tsl/platform/errors.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/framework/collective.h" #include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/refcount.h" namespace tensorflow { xla::XlaOp XlaHelpers::Zero(xla::XlaBuilder* b, DataType data_type) { xla::PrimitiveType type; - TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); + CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return xla::ConstantLiteral(b, xla::LiteralUtil::Zero(type)); } xla::XlaOp XlaHelpers::One(xla::XlaBuilder* b, DataType data_type) { xla::PrimitiveType type; - TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); + CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return xla::ConstantLiteral(b, xla::LiteralUtil::One(type)); } xla::XlaOp XlaHelpers::IntegerLiteral(xla::XlaBuilder* b, DataType data_type, int64_t value) { xla::PrimitiveType type; - TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); + CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return ::tensorflow::IntegerLiteral(b, type, value); } xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type, double value) { xla::PrimitiveType type; - TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); + CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return ::tensorflow::FloatLiteral(b, type, value); } @@ -139,7 +155,7 @@ DataType XlaHelpers::SumAccumulationType(const DataType& dtype) { xla::XlaOp XlaHelpers::ConvertElementType(const xla::XlaOp operand, const DataType new_element_type) { xla::PrimitiveType convert_to; - TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to)); + CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to)); return xla::ConvertElementType(operand, convert_to); } diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index c74db865769229..f8e85ba81f677a 100644 --- 
a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -15,9 +15,14 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include +#include +#include #include +#include #include +#include +#include +#include #include "absl/algorithm/container.h" #include "absl/log/check.h" @@ -28,6 +33,7 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "xla/tsl/platform/errors.h" #include "xla/util.h" #include "tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h" #include "tensorflow/core/framework/device_base.h" @@ -42,11 +48,7 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/tfrt/common/pjrt_util.h" -#include "tsl/platform/errors.h" -#include "tsl/platform/status.h" namespace tensorflow { @@ -265,7 +267,7 @@ void XlaOpRegistry::RegisterCompilationKernels() { "Ops registered: \n" + dynamic_cast(op_registry)->DebugString(true)); } - TF_CHECK_OK(lookup_status); + CHECK_OK(lookup_status); std::unordered_set type_attrs; for (const OpDef::AttrDef& attr_def : op_def->attr()) { @@ -475,7 +477,7 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const std::string& op) { } } else { int start, stop; - TF_CHECK_OK(op_kernel->InputRange(input, &start, &stop)); + CHECK_OK(op_kernel->InputRange(input, &start, &stop)); for (int i = start; i < stop; ++i) { result->push_back(i); } diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b76a4ffd8955b9..183b5289a1736c 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -180,23 +180,23 @@ tf_proto_library( srcs = [], create_go_proto = False, make_default_target_header_only = True, - protodeps = [ + visibility = ["//visibility:public"], + deps = [ "//tensorflow/core/example:protos_all", "//tensorflow/core/framework:protos_all", + "//tensorflow/core/grappler/costs:op_performance_data", "//tensorflow/core/lib/core:error_codes_proto", "//tensorflow/core/profiler:profiler_options_proto", "//tensorflow/core/protobuf:error_codes_proto_impl", "//tensorflow/core/protobuf:for_core_protos", "//tensorflow/core/util:protos_all", "//tensorflow/core/util:test_log_proto", - "//tensorflow/core/grappler/costs:op_performance_data", "@local_tsl//tsl/profiler/protobuf:profiler_options_proto", "@local_tsl//tsl/profiler/protobuf:xplane_proto", "@local_xla//xla/tsl/protobuf:coordination_config_proto", "@local_xla//xla/tsl/protobuf:distributed_runtime_payloads_proto", "@local_xla//xla/tsl/protobuf:status_proto", ], - visibility = ["//visibility:public"], ) tf_jspb_proto_library( diff --git a/tensorflow/core/common_runtime/device/device_event_mgr_test.cc b/tensorflow/core/common_runtime/device/device_event_mgr_test.cc index d252b541fcddfb..e485fb8c7d31b0 100644 --- a/tensorflow/core/common_runtime/device/device_event_mgr_test.cc +++ b/tensorflow/core/common_runtime/device/device_event_mgr_test.cc @@ -184,12 +184,12 @@ class EMBenchmarkHelper { // The rest of these are one per chain. 
NodeDef add_node_def_; NodeDef id_node_def_; - gtl::InlinedVector add_inputs_; + absl::InlinedVector add_inputs_; std::vector allocator_attrs_; - gtl::InlinedVector gpu_inputs_; - gtl::InlinedVector gpu_outputs_; - gtl::InlinedVector host_inputs_; - gtl::InlinedVector host_outputs_; + absl::InlinedVector gpu_inputs_; + absl::InlinedVector gpu_outputs_; + absl::InlinedVector host_inputs_; + absl::InlinedVector host_outputs_; public: // Length of tensors. TODO(tucker): make this a variable parameter. @@ -242,7 +242,7 @@ class EMBenchmarkHelper { } std::unique_ptr GetOpKernel(const NodeDef& node_def, - Status* status) { + absl::Status* status) { return CreateOpKernel("GPU", gpu_helper_->gpu(), gpu_helper_->gpu_allocator(), node_def, TF_GRAPH_DEF_VERSION, status); @@ -256,7 +256,7 @@ class EMBenchmarkHelper { .Device("/job:a/replica:0/task:0/GPU:0") .Finalize(&add_node_def_)); } - Status status; + absl::Status status; add_kernels_.emplace_back(GetOpKernel(add_node_def_, &status)); TF_ASSERT_OK(status); add_params_.push_back(new OpKernelContext::Params); @@ -385,12 +385,12 @@ class EMBenchmarkHelper { gpu_helper_->h2d_stream()->WaitFor(gpu_helper_->compute_stream())); // Begin by copying the input values from CPU to GPU. const int64_t src_bytes = host_inputs_[0].TotalBytes(); - se::DeviceMemoryBase gpu_dst_ptr0(DMAHelper::base(&gpu_inputs_[0]), - src_bytes); + stream_executor::DeviceAddressBase gpu_dst_ptr0( + DMAHelper::base(&gpu_inputs_[0]), src_bytes); TF_ASSERT_OK(gpu_helper_->h2d_stream()->Memcpy( &gpu_dst_ptr0, DMAHelper::base(&host_inputs_[0]), src_bytes)); - se::DeviceMemoryBase gpu_dst_ptr1(DMAHelper::base(&gpu_inputs_[1]), - src_bytes); + stream_executor::DeviceAddressBase gpu_dst_ptr1( + DMAHelper::base(&gpu_inputs_[1]), src_bytes); TF_ASSERT_OK(gpu_helper_->h2d_stream()->Memcpy( &gpu_dst_ptr1, DMAHelper::base(&host_inputs_[1]), src_bytes)); TF_ASSERT_OK( @@ -421,8 +421,8 @@ class EMBenchmarkHelper { TF_ASSERT_OK( gpu_helper_->d2h_stream()->WaitFor(gpu_helper_->compute_stream())); const int64_t return_bytes = ctx->mutable_output(0)->TotalBytes(); - se::DeviceMemoryBase gpu_src_ptr(DMAHelper::base(ctx->mutable_output(0)), - return_bytes); + stream_executor::DeviceAddressBase gpu_src_ptr( + DMAHelper::base(ctx->mutable_output(0)), return_bytes); TF_ASSERT_OK(gpu_helper_->d2h_stream()->Memcpy( DMAHelper::base(&host_outputs_[0]), gpu_src_ptr, return_bytes)); gpu_helper_->event_mgr()->ThenExecute(gpu_helper_->d2h_stream(), diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index d285bb2f8740d1..ed5ab0149ecbee 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -272,7 +272,7 @@ tf_cuda_library( "//tensorflow/core/framework:resource_base", "@local_xla//xla/pjrt/distributed:key_value_store_interface", "@local_xla//xla/pjrt:local_device_state", - "@local_xla//xla/pjrt/gpu:gpu_topology", + "@local_xla//xla/service:gpu_topology", "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt:pjrt_compiler", "@local_xla//xla/service/gpu:gpu_executable_run_options", diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc index 9852cce5ee3413..e7700d1076c132 100644 --- a/tensorflow/core/common_runtime/eager/attr_builder.cc +++ b/tensorflow/core/common_runtime/eager/attr_builder.cc @@ -35,13 +35,14 @@ namespace { mutex g_op_name_to_attr_type_map_lock(LINKER_INITIALIZED); -tensorflow::gtl::FlatMap* OpNameToAttrTypeMap() { 
+tensorflow::gtl::FlatMap* +OpNameToAttrTypeMap() { static auto* const m = - new tensorflow::gtl::FlatMap; + new tensorflow::gtl::FlatMap; return m; } -const uint32 kIsList = 1U << 31; +const uint32_t kIsList = 1U << 31; AttrTypeMap* DefaultFunctionAttrTypeMap() { AttrTypeMap* map = new AttrTypeMap(); @@ -57,7 +58,7 @@ const AttrTypeMap* GetDefaultFunctionAttrTypeMap() { } // namespace -absl::Status OpDefForOp(const string& op_name, const OpDef** op_def) { +absl::Status OpDefForOp(const std::string& op_name, const OpDef** op_def) { const OpRegistrationData* op_reg_data = nullptr; absl::Status s = OpRegistry::Global()->LookUp(op_name, &op_reg_data); if (s.ok()) { @@ -102,12 +103,12 @@ absl::Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out, // TODO(agarwal): Avoid having to create this "registry" at runtime, // perhaps can be done at op registration time? for (const auto& attr : op_def->attr()) { - string type = attr.type(); + std::string type = attr.type(); const bool is_list = (type.length() > 6 && type.compare(0, 4, "list") == 0); if (is_list) { type = type.substr(5, type.length() - 6); } - uint32 t = is_list ? kIsList : 0; + uint32_t t = is_list ? kIsList : 0; if (type == "string") { t |= TF_ATTR_STRING; } else if (type == "int") { @@ -163,7 +164,7 @@ DEFINE_GET_ATTR(tensorflow::DataType, type, "type"); template <> absl::Status AttrBuilder::Get(absl::string_view attr_name, absl::InlinedVector* value) const { - auto it = encoded_attrs_.find(string(attr_name)); + auto it = encoded_attrs_.find(std::string(attr_name)); if (it == encoded_attrs_.end()) { return errors::NotFound("No attr named '", attr_name, "' found in AttrBuilder for ", op_name_); @@ -207,7 +208,7 @@ void AttrBuilder::FillAttrValueMap(AttrValueMap* m) const { namespace { -bool ValueMatchesDefault(const OpDef* op_def, const string& attr_name, +bool ValueMatchesDefault(const OpDef* op_def, const std::string& attr_name, const AttrValue& attr_value) { // TODO(iga): It might make sense to augment OpRegistrationData with a // {attr_name -> default_attr_value} FlatMap to avoid the loop here. 
@@ -238,7 +239,7 @@ void AttrBuilder::FillAttrValueMapWithoutDefaults(AttrValueMap* m) const { void AttrBuilder::AddAttrIfNotPresent(absl::string_view attr_name, const AttrValue& value) { - encoded_attrs_.emplace(string(attr_name), value.SerializeAsString()); + encoded_attrs_.emplace(std::string(attr_name), value.SerializeAsString()); } const NodeDef& AttrBuilder::BuildNodeDef() { @@ -260,7 +261,7 @@ void AttrBuilder::CopyAttributes(const AttrBuilder& other) { other.encoded_attrs_.end()); } -absl::Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name, +absl::Status AttrTypeByName(const AttrTypeMap& m, const std::string& attr_name, TF_AttrType* out, unsigned char* is_list) { auto* t = gtl::FindOrNull(m, attr_name); if (t == nullptr) { @@ -290,7 +291,7 @@ inline tensorflow::Fprint128 CacheKeyHelper(absl::string_view s, return FingerprintCat128(a, b); } -inline tensorflow::Fprint128 CacheKeyHelper(absl::string_view s, uint64 b) { +inline tensorflow::Fprint128 CacheKeyHelper(absl::string_view s, uint64_t b) { return CacheKeyHelper(s, {b, b}); } @@ -299,7 +300,7 @@ inline tensorflow::Fprint128 CacheKeyHelper(absl::string_view s, uint64 b) { tensorflow::Fprint128 AttrBuilder::CacheKey(const absl::string_view device) { if (!cached_cache_key_ || device != device_for_cached_cache_key_) { cached_cache_key_ = BuildCacheKeyForDevice(device); - device_for_cached_cache_key_ = string(device); + device_for_cached_cache_key_ = std::string(device); } return *cached_cache_key_; diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h index 9dc480d8c8187a..bdd644a6331ca6 100644 --- a/tensorflow/core/common_runtime/eager/attr_builder.h +++ b/tensorflow/core/common_runtime/eager/attr_builder.h @@ -40,10 +40,10 @@ namespace tensorflow { // If the type is not a list type, the value is the same as the TF_AttrType type // of the value. Else, the highest order bit is on, and the rest of the bits // represent the TF_AttrType type of the values in the list. -typedef std::unordered_map AttrTypeMap; +typedef std::unordered_map AttrTypeMap; // Look up OpDef for `op_name`. -absl::Status OpDefForOp(const string& op_name, const OpDef** op_def); +absl::Status OpDefForOp(const std::string& op_name, const OpDef** op_def); // Returns the AttrTypeMap for the TensorFlow operation named op_name. // If op_name is not registered in global op registry, AttrTypeMapForOp assumes @@ -53,7 +53,7 @@ absl::Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out, bool* is_function); // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'. -absl::Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name, +absl::Status AttrTypeByName(const AttrTypeMap& m, const std::string& attr_name, TF_AttrType* out, unsigned char* is_list); // KernelAndDevice::Init needs a NodeDef only to pass the attribute map through. @@ -111,8 +111,8 @@ class AttrBuilder : public AbstractOpAttrs { device_for_cached_cache_key_.clear(); } - const string& op_name() const { return op_name_; } - void set_op_name(const string& name) { op_name_ = name; } + const std::string& op_name() const { return op_name_; } + void set_op_name(const std::string& name) { op_name_ = name; } // Needed to work around call to ValidateNodeDef in CreateOpKernel. 
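The AttrTypeMap comment above describes how a list attr is encoded: the top bit is set and the low bits carry the element's TF_AttrType. A minimal, self-contained sketch of that encoding, with plain uint32_t standing in for TF_AttrType:

#include <cstdint>

// Same bit layout as kIsList in attr_builder.cc: the high bit flags a list
// attr, the remaining bits hold the element type.
constexpr uint32_t kIsListBit = 1U << 31;

uint32_t EncodeAttrType(uint32_t element_type, bool is_list) {
  return element_type | (is_list ? kIsListBit : 0U);
}

void DecodeAttrType(uint32_t encoded, uint32_t* element_type, bool* is_list) {
  *is_list = (encoded & kIsListBit) != 0;
  *element_type = encoded & ~kIsListBit;
}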
AttrBuilder& NumInputs(int n); @@ -186,7 +186,7 @@ class AttrBuilder : public AbstractOpAttrs { tensorflow::Fprint128 BuildCacheKeyForDevice(absl::string_view device) const; template - void SetInAttrValueMap(AttrValueMap* m, const string& attr_name, + void SetInAttrValueMap(AttrValueMap* m, const std::string& attr_name, T&& value) const { DCHECK(!node_def_finalized_) << "Calling SetInAttrValueMap after BuildNodeDef."; @@ -196,17 +196,17 @@ class AttrBuilder : public AbstractOpAttrs { void AddAttrIfNotPresent(absl::string_view attr_name, const AttrValue& value); - gtl::FlatMap encoded_attrs_; + gtl::FlatMap encoded_attrs_; mutable AttrValue attr_tmp_; // For encoding - string op_name_; + std::string op_name_; int num_inputs_; NodeDef node_def_; bool node_def_initialized_; bool node_def_finalized_; std::optional cached_cache_key_; - string device_for_cached_cache_key_; + std::string device_for_cached_cache_key_; }; template <> diff --git a/tensorflow/core/common_runtime/eager/attr_builder_test.cc b/tensorflow/core/common_runtime/eager/attr_builder_test.cc index 77462842f493a2..e0a35cfc59c524 100644 --- a/tensorflow/core/common_runtime/eager/attr_builder_test.cc +++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc @@ -85,8 +85,8 @@ TEST(AttrTypeMap, CacheKey) { ASSERT_FALSE(cache_key == a.CacheKey("cpu:0")); } -string ToString(const AttrValueMap& m) { - std::vector strs; +std::string ToString(const AttrValueMap& m) { + std::vector strs; for (const auto& e : m) { strs.push_back(absl::StrCat(e.first, " -> ", e.second.DebugString())); } diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index fe649546530f3c..358c51f22c098e 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -150,7 +150,8 @@ EagerContext::EagerContext( allow_soft_placement_(opts.config.allow_soft_placement()), num_active_steps_(0), step_container_(std::make_unique( - 0, [this](const string& name) { ClearResourceContainer(name); })), + 0, + [this](const std::string& name) { ClearResourceContainer(name); })), default_executor_(async, /*enable_streaming_enqueue=*/!opts.config.experimental() .disable_eager_executor_streaming_enqueue()), @@ -198,7 +199,7 @@ AbstractTensorInterface* EagerContext::CreateInt64Scalar(int64_t value) { return new TensorInterface(Tensor(value)); } -AbstractTensorInterface* EagerContext::CreateUint64Scalar(uint64 value) { +AbstractTensorInterface* EagerContext::CreateUint64Scalar(uint64_t value) { return new TensorInterface(Tensor(value)); } @@ -285,8 +286,9 @@ void EagerContext::InitPrioritizedDeviceTypeList() { namespace { // Using absl::StrJoin with lambda does not work in tf-lite builds. // TODO(b/148160441): Replace with absl::StrJoin once DeviceBase has operator<<. 
-std::vector DevicesToString(const PrioritizedDeviceVector& devices) { - std::vector v; +std::vector DevicesToString( + const PrioritizedDeviceVector& devices) { + std::vector v; v.reserve(devices.size()); for (const auto& p : devices) { v.push_back(p.first->name()); @@ -294,9 +296,9 @@ std::vector DevicesToString(const PrioritizedDeviceVector& devices) { return v; } -std::vector DeviceTypesToString( +std::vector DeviceTypesToString( const PrioritizedDeviceTypeVector& types) { - std::vector v; + std::vector v; v.reserve(types.size()); for (const auto& p : types) { v.push_back(p.first.type_string()); @@ -316,8 +318,8 @@ std::vector DeviceTypesToString( Device* SelectBestMatchingDevice(const DeviceNameUtils::ParsedName& pattern, const PrioritizedDeviceVector& existing, const PrioritizedDeviceTypeVector& supported) { - for (const std::pair& prioritized_type : supported) { - for (const std::pair& prioritized_device : existing) { + for (const std::pair& prioritized_type : supported) { + for (const std::pair& prioritized_device : existing) { Device* dev = prioritized_device.first; if (DeviceType(dev->attributes().device_type()) == prioritized_type.first && @@ -485,7 +487,7 @@ void EagerContext::ClearCachesAndDefaultExecutor() { { mutex_lock ml(metadata_mu_); step_container_ = std::make_unique( - 0, [this](const string& name) { ClearResourceContainer(name); }); + 0, [this](const std::string& name) { ClearResourceContainer(name); }); } } @@ -509,7 +511,7 @@ ContextDevicePlacementPolicy EagerContext::GetDevicePlacementPolicy() const { } #if !defined(IS_MOBILE_PLATFORM) -std::vector EagerContext::GetRemoteContexts() { +std::vector EagerContext::GetRemoteContexts() { tf_shared_lock l(remote_state_mu_); return remote_contexts_; } @@ -520,9 +522,9 @@ bool EagerContext::IsRemoteContextsEmpty() { } void EagerContext::CloseAndClearAllRemoteContexts() { - uint64 context_id; - uint64 context_view_id; - std::vector remote_contexts_copy; + uint64_t context_id; + uint64_t context_view_id; + std::vector remote_contexts_copy; { mutex_lock l(remote_state_mu_); if (!is_master_) return; @@ -541,8 +543,8 @@ void EagerContext::CloseAndClearAllRemoteContexts() { } void EagerContext::CloseRemoteContexts( - const std::vector& remote_contexts, uint64 context_id, - uint64 context_view_id) { + const std::vector& remote_contexts, uint64_t context_id, + uint64_t context_view_id) { // Close all remote contexts. eager::CloseContextRequest request; request.set_context_id(context_id); @@ -689,21 +691,22 @@ EagerContext::~EagerContext() { } } -bool EagerContext::FindFunctionByName(const string& name) const { +bool EagerContext::FindFunctionByName(const std::string& name) const { return func_lib_def_.Find(name) != nullptr; } absl::Status EagerContext::FindFunctionOpData( - const string& name, const tensorflow::OpRegistrationData** op_data) { + const std::string& name, const tensorflow::OpRegistrationData** op_data) { return func_lib_def_.LookUp(name, op_data); } -const FunctionDef* EagerContext::FindFunctionDef(const string& name) const { +const FunctionDef* EagerContext::FindFunctionDef( + const std::string& name) const { return func_lib_def_.Find(name); } core::RefCountPtr EagerContext::FindRecord( - const string& name) const { + const std::string& name) const { return func_lib_def_.FindRecord(name); } @@ -763,7 +766,7 @@ std::vector EagerContext::ListAllTfDevices() { // Since remote_device_mgr may also contain local devices, make sure no // duplicated device is returned. 
std::vector devices; - std::unordered_set dev_names; + std::unordered_set dev_names; if (local_device_mgr()) { for (const auto& dev : local_device_mgr()->ListDevices()) { @@ -832,7 +835,7 @@ void EagerContext::EndStep() { // TODO(b/139809335): This does not properly clean up remote resources // Clean up the previous step container and create a new one. step_container_ = std::make_unique( - 0, [this](const string& name) { ClearResourceContainer(name); }); + 0, [this](const std::string& name) { ClearResourceContainer(name); }); } } @@ -880,7 +883,7 @@ absl::Status EagerContext::MaybeRegisterFunctionRemotely( } absl::Status EagerContext::MaybeRemoveFunctionRemotely( - const string& function_name) { + const std::string& function_name) { // Only client context can remove function on remote worker context. if (!remote_device_manager_.Owned()) { return absl::OkStatus(); @@ -917,10 +920,10 @@ absl::Status EagerContext::MaybeRemoveFunctionRemotely( } absl::Status EagerContext::RegisterExistingFunctionsOnRemoteWorkers( - const std::vector& remote_workers) { + const std::vector& remote_workers) { #if !defined(IS_MOBILE_PLATFORM) // Register multiple functions on selected remote workers. - uint64 context_id = GetContextId(); + uint64_t context_id = GetContextId(); FunctionDefLibrary function_defs = func_lib_def_.ToProto(); std::vector> requests( function_defs.function_size()); @@ -1079,16 +1082,17 @@ absl::Status EagerContext::AddComponentFunction( return absl::OkStatus(); } -const FunctionDef* EagerContext::GetFunctionDef(const string& function_name) { +const FunctionDef* EagerContext::GetFunctionDef( + const std::string& function_name) { return func_lib_def_.Find(function_name); } -std::vector EagerContext::ListFunctionNames() { +std::vector EagerContext::ListFunctionNames() { return func_lib_def_.ListFunctionNames(); } absl::Status EagerContext::AddRemoveFunctionNotifier( - const string& func, std::function notifier) { + const std::string& func, std::function notifier) { mutex_lock l(remove_function_notifiers_mu_); auto iter = remove_function_notifiers_.find(func); if (iter != remove_function_notifiers_.end()) { @@ -1122,7 +1126,7 @@ EagerContext::GetCacheStats() { return stats; } -absl::Status EagerContext::RemoveFunction(const string& func) { +absl::Status EagerContext::RemoveFunction(const std::string& func) { // TODO(mdan): The context owns these functions. Why check refcount then? 
std::vector> notifiers; bool is_last_ref = false; @@ -1308,14 +1312,14 @@ absl::Status EagerContext::FindCompositeDeviceFromName( return errors::NotFound("Unknown composite device: ", device_name); } -bool EagerContext::IsCustomDevice(const string& device_name) { +bool EagerContext::IsCustomDevice(const std::string& device_name) { CustomDevice* device = nullptr; return custom_device_op_handler_.FindCustomDeviceFromName(device_name, &device); } absl::Status EagerContext::RegisterCustomDevice( - const string& device_name, std::unique_ptr device) { + const std::string& device_name, std::unique_ptr device) { Device* existing_physical_device = nullptr; if (FindDeviceFromName(device_name.c_str(), &existing_physical_device).ok()) { return errors::AlreadyExists(device_name, @@ -1326,14 +1330,15 @@ absl::Status EagerContext::RegisterCustomDevice( } absl::Status EagerContext::FindOrCreateCompositeDevice( - const std::vector& underlying_devices, const string& device_name, - CompositeDevice** composite_device) { + const std::vector& underlying_devices, + const std::string& device_name, CompositeDevice** composite_device) { if (!device_name.empty() && FindCompositeDeviceFromName(device_name, composite_device).ok()) { return absl::OkStatus(); } - const uint64 hash_key = Fingerprint64(absl::StrJoin(underlying_devices, ",")); + const uint64_t hash_key = + Fingerprint64(absl::StrJoin(underlying_devices, ",")); mutex_lock l(composite_devices_mu_); auto iter = composite_devices_.find(hash_key); @@ -1371,14 +1376,14 @@ bool EagerContext::OnSameTask(const Device* first, const Device* second) const { // Gets the CPU device on the task of device. absl::Status EagerContext::CPUDeviceOnTask(const Device* device, Device** cpu_device) const { - string cpu_device_name; + std::string cpu_device_name; TF_RETURN_IF_ERROR(DeviceNameUtils::DeviceNameToCpuDeviceName( device->name(), &cpu_device_name)); return FindDeviceFromName(cpu_device_name.c_str(), cpu_device); } -void EagerContext::ClearResourceContainer(const string& name) { +void EagerContext::ClearResourceContainer(const std::string& name) { // TODO(b/139809335): This does not properly clean up remote resources auto local_devices = local_device_mgr()->ListDevices(); for (Device* device : local_devices) { @@ -1406,8 +1411,8 @@ void EagerContext::UpdateGlobalRendezvousDeviceManager( } namespace { -absl::Status GetTaskName(Device* d, string* task_name) { - string ignored; +absl::Status GetTaskName(Device* d, std::string* task_name) { + std::string ignored; if (!DeviceNameUtils::SplitDeviceName(d->name(), task_name, &ignored)) { return errors::InvalidArgument("Unable to parse device name: ", d->name()); } @@ -1425,7 +1430,7 @@ absl::Status EagerContext::GetClient( absl::Status EagerContext::GetClient( const DeviceNameUtils::ParsedName& device_name, core::RefCountPtr* client) { - string device_task_name; + std::string device_task_name; if (!DeviceNameUtils::GetTaskName(device_name, &device_task_name)) { return errors::InvalidArgument( "Task is not fully specified in device name: ", @@ -1457,7 +1462,8 @@ absl::Status EagerContext::GetClient( } absl::Status EagerContext::GetClient( - const string& remote_task, core::RefCountPtr* client) { + const std::string& remote_task, + core::RefCountPtr* client) { { tf_shared_lock l(remote_state_mu_); if (remote_eager_workers_ == nullptr) { @@ -1474,12 +1480,12 @@ absl::Status EagerContext::GetClient( return absl::OkStatus(); } -uint64 EagerContext::GetContextId() const { +uint64_t EagerContext::GetContextId() const { tf_shared_lock 
l(remote_state_mu_); return context_id_; } -uint64 EagerContext::GetContextViewId() const { +uint64_t EagerContext::GetContextViewId() const { tf_shared_lock l(remote_state_mu_); return context_view_id_; } @@ -1544,9 +1550,10 @@ absl::Status EagerContext::StoreCollectiveOpsServer( } absl::Status EagerContext::SetRemoteDeviceFilters( - const string& remote_worker, const std::vector& device_filters) { + const std::string& remote_worker, + const std::vector& device_filters) { // Get fully specified task name for remote worker - string remote_worker_task_name; + std::string remote_worker_task_name; DeviceNameUtils::ParsedName pw; if (!DeviceNameUtils::ParseFullName(remote_worker, &pw)) { return tensorflow::errors::InvalidArgument( @@ -1583,7 +1590,7 @@ absl::Status EagerContext::SetRemoteDeviceFilters( } void EagerContext::FilterDevicesForRemoteWorkers( - const string& remote_worker, + const std::string& remote_worker, const protobuf::RepeatedPtrField& device_attrs, std::vector* filtered_device_mask) { filtered_device_mask->resize(device_attrs.size()); @@ -1634,7 +1641,7 @@ absl::Status EagerContext::InitializeRemoteMaster( std::shared_ptr worker_session, std::unique_ptr remote_eager_workers, std::unique_ptr remote_device_manager, - const std::vector& remote_contexts, uint64 context_id, + const std::vector& remote_contexts, uint64_t context_id, tsl::core::RefCountPtr r, DeviceMgr* local_device_mgr, int keep_alive_secs, DistributedFunctionLibraryRuntime* cluster_flr, std::unique_ptr> @@ -1661,10 +1668,10 @@ absl::Status EagerContext::InitializeRemoteMaster( } absl::Status EagerContext::UpdateRemoteMaster( - uint64 context_id, + uint64_t context_id, std::unique_ptr remote_eager_workers, - const std::vector& add_remote_contexts, - const std::vector& remove_remote_contexts) { + const std::vector& add_remote_contexts, + const std::vector& remove_remote_contexts) { { tf_shared_lock l(remote_state_mu_); if (context_id != context_id_) { @@ -1682,7 +1689,7 @@ absl::Status EagerContext::UpdateRemoteMaster( // a larger view id and ignores this request. 
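The FindOrCreateCompositeDevice hunk further above keys the composite-device cache on a fingerprint of the comma-joined underlying device names. A small sketch of that key computation as it reads after this change:

#include <cstdint>
#include <string>
#include <vector>

#include "absl/strings/str_join.h"
#include "tensorflow/core/platform/fingerprint.h"

// One 64-bit key per ordered list of underlying device names; identical lists
// map to the same cached CompositeDevice.
uint64_t CompositeDeviceCacheKey(
    const std::vector<std::string>& underlying_devices) {
  return tensorflow::Fingerprint64(absl::StrJoin(underlying_devices, ","));
}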
CloseRemoteContexts(remove_remote_contexts, context_id, GetContextViewId()); mutex_lock l(remote_state_mu_); - for (const string& remote_context : remove_remote_contexts) { + for (const std::string& remote_context : remove_remote_contexts) { remote_contexts_.erase( std::remove(remote_contexts_.begin(), remote_contexts_.end(), remote_context), @@ -1731,10 +1738,10 @@ absl::Status EagerContext::SetMasterContextState( std::unique_ptr server, WorkerEnv* worker_env, std::shared_ptr worker_session, std::unique_ptr remote_eager_workers, - std::unique_ptr remote_device_manager, uint64 context_id, - uint64 context_view_id, tsl::core::RefCountPtr r, - DeviceMgr* local_device_mgr, int keep_alive_secs, - DistributedFunctionLibraryRuntime* cluster_flr, + std::unique_ptr remote_device_manager, + uint64_t context_id, uint64_t context_view_id, + tsl::core::RefCountPtr r, DeviceMgr* local_device_mgr, + int keep_alive_secs, DistributedFunctionLibraryRuntime* cluster_flr, std::unique_ptr> remote_mgr) { mutex_lock l(remote_state_mu_); @@ -1852,8 +1859,8 @@ absl::Status EagerContext::SetMasterContextState( absl::Status EagerContext::InitializeRemoteWorker( std::unique_ptr remote_eager_workers, DynamicDeviceMgr* remote_device_mgr, - const std::vector& remote_contexts, uint64 context_id, - uint64 context_view_id, + const std::vector& remote_contexts, uint64_t context_id, + uint64_t context_view_id, std::function(const int64_t)> rendezvous_creator, DistributedFunctionLibraryRuntime* cluster_flr, @@ -1908,7 +1915,7 @@ absl::Status EagerContext::InitializeRemoteWorker( absl::Status EagerContext::UpdateRemoteWorker( std::unique_ptr remote_eager_workers, - const std::vector& remote_contexts, uint64 context_id) { + const std::vector& remote_contexts, uint64_t context_id) { { mutex_lock l(remote_state_mu_); if (context_id != context_id_) { diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index 98fa2e7e31b9a7..1013cc17bf95fe 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -86,10 +86,10 @@ bool SkipRemoteHandleWaitReady(); class EagerContext : public ImmediateExecutionContext, public core::RefCounted { public: - static constexpr uint64 kInvalidContextId = 0; + static constexpr uint64_t kInvalidContextId = 0; - static uint64 NewContextId() { - uint64 context_id = random::New64(); + static uint64_t NewContextId() { + uint64_t context_id = random::New64(); while (context_id == kInvalidContextId) { context_id = random::New64(); } @@ -108,7 +108,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { void Release() override { Unref(); } AbstractTensorInterface* CreateInt64Scalar(int64_t value) override; - AbstractTensorInterface* CreateUint64Scalar(uint64 value) override; + AbstractTensorInterface* CreateUint64Scalar(uint64_t value) override; AbstractTensorInterface* CreateInt32Scalar(int32_t value) override; AbstractTensorInterface* CreateFloatScalar(float value) override; AbstractTensorInterface* CreateDoubleScalar(double value) override; @@ -208,14 +208,14 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { const NodeDef& ndef, Device** out) const; // TODO(mdan): Rename to ContainsFunction. 
- bool FindFunctionByName(const string& name) const; + bool FindFunctionByName(const std::string& name) const; absl::Status FindFunctionOpData( - const string& name, const tensorflow::OpRegistrationData** op_data); + const std::string& name, const tensorflow::OpRegistrationData** op_data); - const FunctionDef* FindFunctionDef(const string& name) const override; + const FunctionDef* FindFunctionDef(const std::string& name) const override; core::RefCountPtr FindRecord( - const string& name) const override; + const std::string& name) const override; Device* HostCPU() const { return host_cpu_device_; } Device* CanonicalDevice(Device* d) const { @@ -225,7 +225,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { return HostCPU()->parsed_name(); } - const string& HostCPUName() const override { return HostCPU()->name(); } + const std::string& HostCPUName() const override { return HostCPU()->name(); } GraphCollector* GetGraphCollector() { return &graph_collector_; } @@ -263,14 +263,14 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { absl::Status AddComponentFunction(const FunctionDef& fdef, const FunctionDefLibrary& library); - const FunctionDef* GetFunctionDef(const string& function_name); + const FunctionDef* GetFunctionDef(const std::string& function_name); - std::vector ListFunctionNames() override; + std::vector ListFunctionNames() override; tensorflow::ImmediateExecutionContext::CacheStats GetCacheStats() override; - absl::Status RemoveFunction(const string& func) override; + absl::Status RemoveFunction(const std::string& func) override; absl::Status AddRemoveFunctionNotifier( - const string& func, std::function notifier) override; + const std::string& func, std::function notifier) override; // Wait for pending nodes to be finished in local executors (including context // default executor and thread executors) and executors on remote workers. 
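The context-id hunks above keep the retry-until-nonzero pattern while moving to uint64_t. A minimal standalone sketch of that pattern, using <random> in place of TensorFlow's random::New64() (the generator choice is an assumption, not the real implementation):

#include <cstdint>
#include <random>

constexpr std::uint64_t kInvalidContextId = 0;  // 0 is reserved for "no context".

std::uint64_t NewContextId() {
  static std::mt19937_64 rng{std::random_device{}()};
  std::uint64_t id = rng();
  while (id == kInvalidContextId) {  // Re-draw until we get a usable id.
    id = rng();
  }
  return id;
}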
@@ -401,7 +401,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { const FunctionLibraryDefinition* FuncLibDef() const { return &func_lib_def_; } FunctionLibraryDefinition* GetComponentFunctionFunctionLibraryDefinition( - const string& function_name) { + const std::string& function_name) { tf_shared_lock lock(cache_mu_); auto iter = component_function_libraries_.find(function_name); if (iter != component_function_libraries_.end()) { @@ -421,11 +421,11 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { core::RefCountPtr* client); absl::Status GetClient(const DeviceNameUtils::ParsedName& device_name, core::RefCountPtr* client); - absl::Status GetClient(const string& remote_task, + absl::Status GetClient(const std::string& remote_task, core::RefCountPtr* client); - uint64 GetContextId() const; - uint64 GetContextViewId() const; + uint64_t GetContextId() const; + uint64_t GetContextViewId() const; void IncrementContextViewId(); absl::Status EnableCollectiveOps(const ServerDef& server_def) override; @@ -450,7 +450,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { std::shared_ptr worker_session, std::unique_ptr remote_eager_workers, std::unique_ptr remote_device_manager, - const std::vector& remote_contexts, uint64 context_id, + const std::vector& remote_contexts, uint64_t context_id, tsl::core::RefCountPtr r, /*const*/ DeviceMgr* local_device_mgr, int keep_alive_secs, DistributedFunctionLibraryRuntime* cluster_flr, @@ -464,18 +464,18 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { // can still be accessed, and will automatically register existing functions // if there are newly added hosts. absl::Status UpdateRemoteMaster( - uint64 context_id, + uint64_t context_id, std::unique_ptr remote_eager_workers, - const std::vector& add_remote_contexts, - const std::vector& remove_remote_contexts); + const std::vector& add_remote_contexts, + const std::vector& remove_remote_contexts); // Similar with InitializeRemoteMaster but this context will not kill remote // contexts in shutdown. absl::Status InitializeRemoteWorker( std::unique_ptr remote_eager_workers, DynamicDeviceMgr* remote_device_mgr, - const std::vector& remote_contexts, uint64 context_id, - uint64 context_view_id, + const std::vector& remote_contexts, uint64_t context_id, + uint64_t context_view_id, std::function(const int64_t)> rendezvous_creator, DistributedFunctionLibraryRuntime* cluster_flr, @@ -487,7 +487,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { // increment context_view_id. absl::Status UpdateRemoteWorker( std::unique_ptr remote_eager_workers, - const std::vector& remote_contexts, uint64 context_id); + const std::vector& remote_contexts, uint64_t context_id); absl::Status StoreCollectiveOpsServer( std::unique_ptr new_server, DeviceMgr* device_mgr, @@ -495,7 +495,8 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { // For the specified remote worker, preprocess and set its device filters. 
absl::Status SetRemoteDeviceFilters( - const string& remote_worker, const std::vector& device_filters); + const std::string& remote_worker, + const std::vector& device_filters); // For the specified remote worker, apply the stored device filters to the // list of device attributes following these rules: @@ -507,7 +508,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { // filtered_device_mask) indicating whether each of the devices is visible to // the remote worker. void FilterDevicesForRemoteWorkers( - const string& remote_worker, + const std::string& remote_worker, const protobuf::RepeatedPtrField& device_attrs, std::vector* filtered_device_mask); @@ -567,10 +568,10 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { absl::Status FindCompositeDeviceFromName(absl::string_view device_name, CompositeDevice** device) const; - bool IsCustomDevice(const string& device_name) override; + bool IsCustomDevice(const std::string& device_name) override; absl::Status RegisterCustomDevice( - const string& name, std::unique_ptr device) override; + const std::string& name, std::unique_ptr device) override; CustomDeviceOpHandler& GetCustomDeviceOpHandler() override { return custom_device_op_handler_; @@ -579,8 +580,8 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { // Find or create a composite device with the given `underlying_devices` and // `device_name` (if not empty). absl::Status FindOrCreateCompositeDevice( - const std::vector& underlying_devices, const string& device_name, - CompositeDevice** composite_device); + const std::vector& underlying_devices, + const std::string& device_name, CompositeDevice** composite_device); bool OnSameTask(const Device* first, const Device* second) const; // Gets the CPU device on the task of device. @@ -667,9 +668,9 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { ~EagerContext() override; absl::Status MaybeRegisterFunctionRemotely(const FunctionDef& fdef); - absl::Status MaybeRemoveFunctionRemotely(const string& function_name); + absl::Status MaybeRemoveFunctionRemotely(const std::string& function_name); absl::Status RegisterExistingFunctionsOnRemoteWorkers( - const std::vector& remote_workers); + const std::vector& remote_workers); void ResetPFLR(const DeviceMgr* device_mgr, Env* env, const ConfigProto* config, int graph_def_version, @@ -681,7 +682,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { void ResetClusterFLR(DistributedFunctionLibraryRuntime* cluster_flr); void UpdateGlobalRendezvousDeviceManager(tensorflow::DeviceMgr* device_mgr); - void ClearResourceContainer(const string& name); + void ClearResourceContainer(const std::string& name); template struct OwnedOrUnownedHelper { @@ -750,7 +751,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { // Maps from the fingerprint of a set of device names to a virtual // CompositeDevice. // TODO(b/145922293): Consider taking device names as keys. 
- absl::flat_hash_map> + absl::flat_hash_map> composite_devices_ ABSL_GUARDED_BY(composite_devices_mu_); FunctionLibraryDefinition func_lib_def_{OpRegistry::Global(), @@ -780,10 +781,10 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { std::unordered_map, Fprint128Hasher> kernel_cache_ TF_GUARDED_BY(cache_mu_); - std::unordered_map registered_functions_ + std::unordered_map registered_functions_ TF_GUARDED_BY(cache_mu_); - std::unordered_map> + std::unordered_map> component_function_libraries_ TF_GUARDED_BY(cache_mu_); absl::flat_hash_map device_cache_ TF_GUARDED_BY(device_cache_mu_); @@ -830,11 +831,12 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { OwnedOrUnownedHelper collective_executor_mgr_; #if !defined(IS_MOBILE_PLATFORM) - std::vector GetRemoteContexts() TF_LOCKS_EXCLUDED(remote_state_mu_); + std::vector GetRemoteContexts() + TF_LOCKS_EXCLUDED(remote_state_mu_); bool IsRemoteContextsEmpty() TF_LOCKS_EXCLUDED(remote_state_mu_); void CloseAndClearAllRemoteContexts(); - void CloseRemoteContexts(const std::vector& remote_contexts, - uint64 context_id, uint64 context_view_id); + void CloseRemoteContexts(const std::vector& remote_contexts, + uint64_t context_id, uint64_t context_view_id); // TODO(b/184375824): clean up parameter order for better readability. absl::Status SetMasterContextState( @@ -842,7 +844,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { std::shared_ptr worker_session, std::unique_ptr remote_eager_workers, std::unique_ptr remote_device_manager, - uint64 context_id, uint64 context_view_id, + uint64_t context_id, uint64_t context_view_id, tsl::core::RefCountPtr r, /*const*/ DeviceMgr* local_device_mgr, int keep_alive_secs, DistributedFunctionLibraryRuntime* cluster_flr, @@ -858,12 +860,12 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { mutable mutex remote_state_mu_; - uint64 context_id_ TF_GUARDED_BY(remote_state_mu_); + uint64_t context_id_ TF_GUARDED_BY(remote_state_mu_); // The view id of an eager context should be set to 0 when context is created, // and continuously incremented when context with the same context_id gets // updated. The view id should be consistent between master and workers. - uint64 context_view_id_ TF_GUARDED_BY(remote_state_mu_); - std::vector remote_contexts_ TF_GUARDED_BY(remote_state_mu_); + uint64_t context_view_id_ TF_GUARDED_BY(remote_state_mu_); + std::vector remote_contexts_ TF_GUARDED_BY(remote_state_mu_); std::unique_ptr remote_eager_workers_ TF_GUARDED_BY(remote_state_mu_); @@ -880,7 +882,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { bool is_master_ TF_GUARDED_BY(remote_state_mu_); // Maps from a remote worker to a list of parsed device filters. - std::unordered_map> + std::unordered_map> cluster_device_filters_ TF_GUARDED_BY(remote_state_mu_); // A distributed manager that helps setup, update, and check liveness of diff --git a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc index d51031b78b7387..8725479fb891da 100644 --- a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc +++ b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc @@ -80,11 +80,11 @@ limitations under the License. 
#if (defined(PLATFORM_GOOGLE) && defined(TF_PLATFORM_LINUX_X86_64)) #define TF_GPU_USE_PJRT #include "xla/pjrt/distributed/key_value_store_interface.h" -#include "xla/pjrt/gpu/gpu_topology.h" #include "xla/pjrt/gpu/se_gpu_pjrt_client.h" #include "xla/pjrt/local_device_state.h" #include "xla/pjrt/pjrt_compiler.h" #include "xla/service/gpu/gpu_executable_run_options.h" +#include "xla/service/gpu_topology.h" #include "tensorflow/core/framework/resource_base.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/tfrt/common/global_state.h" @@ -362,7 +362,7 @@ bool AreLocalDevicesCompatible(const EagerContext* context, } absl::Status AddRemoteDevicesToMgr( - const std::vector& added_remote_workers, + const std::vector& added_remote_workers, WorkerCacheInterface* worker_cache, DynamicDeviceMgr* remote_device_mgr) { std::vector> remote_devices; mutex remote_devices_mu; @@ -394,7 +394,7 @@ absl::Status AddRemoteDevicesToMgr( } absl::Status GetAllRemoteDevices( - const std::vector& remote_workers, + const std::vector& remote_workers, WorkerCacheInterface* worker_cache, std::unique_ptr* device_mgr) { auto remote_device_mgr = std::make_unique(); @@ -405,13 +405,13 @@ absl::Status GetAllRemoteDevices( } absl::Status RemoveRemoteDevicesFromMgr( - const std::vector& removed_remote_workers, + const std::vector& removed_remote_workers, DynamicDeviceMgr* remote_device_mgr) { const std::vector remote_devices = (remote_device_mgr->ListDevices()); std::vector devices_to_remove; for (Device* d : remote_devices) { - for (const string& remote_worker : removed_remote_workers) { + for (const std::string& remote_worker : removed_remote_workers) { if (DeviceNameUtils::IsSameAddressSpace(remote_worker, d->name())) { devices_to_remove.emplace_back(d); break; @@ -423,8 +423,8 @@ absl::Status RemoveRemoteDevicesFromMgr( } absl::Status ListRemoteWorkers(ServerInterface* server, - const string& local_worker, - std::vector* remote_workers) { + const std::string& local_worker, + std::vector* remote_workers) { server->master_env()->worker_cache->ListWorkers(remote_workers); remote_workers->erase( std::remove(remote_workers->begin(), remote_workers->end(), local_worker), @@ -432,22 +432,22 @@ absl::Status ListRemoteWorkers(ServerInterface* server, return absl::OkStatus(); } -void DifferentiateWorkerLists(const std::vector* current_list, - const std::vector* new_list, - std::vector* added, - std::vector* removed, - std::vector* existing) { +void DifferentiateWorkerLists(const std::vector* current_list, + const std::vector* new_list, + std::vector* added, + std::vector* removed, + std::vector* existing) { // Get STL set_difference and set_intersection with one list traversal. // Similar to the set_difference library function, the input lists // (`current_list` and `new_list`) must be sorted before calling the function. 
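A self-contained sketch of that single-pass merge over two sorted name lists (plain std::string vectors, no TensorFlow types): elements present only in the current list come out as removed, elements only in the new list as added, and common elements as existing.

#include <string>
#include <vector>

void DiffSortedLists(const std::vector<std::string>& current,
                     const std::vector<std::string>& next,
                     std::vector<std::string>* added,
                     std::vector<std::string>* removed,
                     std::vector<std::string>* existing) {
  auto c = current.begin();
  auto n = next.begin();
  while (c != current.end() && n != next.end()) {
    if (*c < *n) {
      removed->push_back(*c++);   // only in the old list
    } else if (*n < *c) {
      added->push_back(*n++);     // only in the new list
    } else {
      existing->push_back(*c++);  // in both lists
      ++n;
    }
  }
  removed->insert(removed->end(), c, current.end());
  added->insert(added->end(), n, next.end());
}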
added->resize(new_list->size()); removed->resize(current_list->size()); existing->resize(current_list->size()); - std::vector::const_iterator curr_it = current_list->begin(); - std::vector::const_iterator new_it = new_list->begin(); - std::vector::iterator added_it = added->begin(); - std::vector::iterator removed_it = removed->begin(); - std::vector::iterator existing_it = existing->begin(); + std::vector::const_iterator curr_it = current_list->begin(); + std::vector::const_iterator new_it = new_list->begin(); + std::vector::iterator added_it = added->begin(); + std::vector::iterator removed_it = removed->begin(); + std::vector::iterator existing_it = existing->begin(); while (curr_it != current_list->end() && new_it != new_list->end()) { if (*curr_it < *new_it) { *removed_it++ = *curr_it++; @@ -466,10 +466,10 @@ void DifferentiateWorkerLists(const std::vector* current_list, } absl::Status GetReplacedFromExistingWorkers( - const std::vector* existing_workers, uint64 context_id, - uint64 context_view_id, const ServerDef& server_def, + const std::vector* existing_workers, uint64_t context_id, + uint64_t context_view_id, const ServerDef& server_def, eager::EagerClientCache* client_cache, - std::vector* replaced_workers) { + std::vector* replaced_workers) { BlockingCounter counter(existing_workers->size()); std::vector statuses(existing_workers->size()); eager::KeepAliveRequest request; @@ -505,8 +505,8 @@ absl::Status GetReplacedFromExistingWorkers( } absl::Status CreateRemoteContexts( - EagerContext* context, const std::vector& remote_workers, - uint64 context_id, uint64 context_view_id, int keep_alive_secs, + EagerContext* context, const std::vector& remote_workers, + uint64_t context_id, uint64_t context_view_id, int keep_alive_secs, const ServerDef& server_def, eager::EagerClientCache* remote_eager_workers, bool async, const eager::CreateContextRequest& base_request, int64_t init_timeout_in_ms, int retries, bool clear_existing_contexts) { @@ -514,7 +514,7 @@ absl::Status CreateRemoteContexts( BlockingCounter counter(num_remote_workers); std::vector statuses(num_remote_workers); for (int i = 0; i < num_remote_workers; i++) { - const string& remote_worker = remote_workers[i]; + const std::string& remote_worker = remote_workers[i]; DeviceNameUtils::ParsedName parsed_name; if (!DeviceNameUtils::ParseFullName(remote_worker, &parsed_name)) { statuses[i] = errors::InvalidArgument("Unable to parse ", remote_worker, @@ -583,10 +583,10 @@ absl::Status CreateRemoteContexts( } absl::Status UpdateRemoteContexts( - EagerContext* context, const std::vector& remote_workers, - const std::vector& added_workers, - const std::vector& removed_workers, uint64 context_id, - uint64 context_view_id, const ServerDef& server_def, + EagerContext* context, const std::vector& remote_workers, + const std::vector& added_workers, + const std::vector& removed_workers, uint64_t context_id, + uint64_t context_view_id, const ServerDef& server_def, eager::EagerClientCache* remote_eager_workers, const eager::CreateContextRequest& base_request) { int num_remote_workers = remote_workers.size(); @@ -594,8 +594,8 @@ absl::Status UpdateRemoteContexts( std::vector statuses(num_remote_workers); int cluster_device_count = base_request.cluster_device_attributes_size(); - std::unordered_set added_or_removed(added_workers.begin(), - added_workers.end()); + std::unordered_set added_or_removed(added_workers.begin(), + added_workers.end()); std::copy(removed_workers.begin(), removed_workers.end(), std::inserter(added_or_removed, 
added_or_removed.end())); // Whether each device is in the updated (added or removed) workers @@ -604,7 +604,7 @@ absl::Status UpdateRemoteContexts( const auto& da = base_request.cluster_device_attributes().at(i); DeviceNameUtils::ParsedName pn; DeviceNameUtils::ParseFullName(da.name(), &pn); - string task_name; + std::string task_name; DeviceNameUtils::GetTaskName(pn, &task_name); if (added_or_removed.find(task_name) != added_or_removed.end()) { device_added_or_removed[i] = true; @@ -612,7 +612,7 @@ absl::Status UpdateRemoteContexts( } for (int i = 0; i < num_remote_workers; i++) { - const string& remote_worker = remote_workers[i]; + const std::string& remote_worker = remote_workers[i]; DeviceNameUtils::ParsedName parsed_name; if (!DeviceNameUtils::ParseFullName(remote_worker, &parsed_name)) { statuses[i] = errors::InvalidArgument("Unable to parse ", remote_worker, @@ -689,15 +689,15 @@ absl::Status UpdateContextWithServerDef(EagerContext* context, bool reset_context, int keep_alive_secs, int64_t init_timeout_in_ms, int retries, bool clear_existing_contexts = false) { - string worker_name = - strings::StrCat("/job:", server_def.job_name(), - "/replica:0/task:", server_def.task_index()); + std::string worker_name = + absl::StrCat("/job:", server_def.job_name(), + "/replica:0/task:", server_def.task_index()); // List of current remote workers before updating server_def. Unused if // resetting the server_def. - std::vector curr_remote_workers; + std::vector curr_remote_workers; // List of updated remote workers. - std::vector remote_workers; + std::vector remote_workers; // New server created for new server_def. Unused if updating server_def. std::unique_ptr new_server; @@ -722,10 +722,10 @@ absl::Status UpdateContextWithServerDef(EagerContext* context, ListRemoteWorkers(server, worker_name, &remote_workers)); } - uint64 context_id = context->GetContextId(); + uint64_t context_id = context->GetContextId(); // TODO(b/291142876) Check for invalid context id here (instead of in the C // API). - uint64 context_view_id = context->GetContextViewId(); + uint64_t context_view_id = context->GetContextViewId(); if (reset_context) { context_id = EagerContext::NewContextId(); context_view_id = 0; @@ -757,10 +757,10 @@ absl::Status UpdateContextWithServerDef(EagerContext* context, // * existing_workers: set(curr_remote_workers) intersect set(remote_workers) // * replaced_workers: workers with the same task names and potentially the // same `hostname:port`s, but replaced by different processes - std::vector added_workers; - std::vector removed_workers; - std::vector existing_workers; - std::vector replaced_workers; + std::vector added_workers; + std::vector removed_workers; + std::vector existing_workers; + std::vector replaced_workers; // New remote device manager created for new server_def. Unused if updating // server_def. 
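The migration from strings::StrCat to absl::StrCat in the hunks below keeps the same task-name pattern; a hedged usage sketch with a placeholder job name and task index:

#include <string>
#include "absl/strings/str_cat.h"

std::string WorkerName(const std::string& job_name, int task_index) {
  // absl::StrCat accepts integral arguments directly.
  return absl::StrCat("/job:", job_name, "/replica:0/task:", task_index);
}
// WorkerName("worker", 3) -> "/job:worker/replica:0/task:3"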
@@ -791,10 +791,11 @@ absl::Status UpdateContextWithServerDef(EagerContext* context, remote_eager_workers.get(), &replaced_workers)); if (VLOG_IS_ON(1)) { VLOG(1) << "Updating cluster with following changes"; - for (const string& w : added_workers) VLOG(1) << " Added worker " << w; - for (const string& w : removed_workers) + for (const std::string& w : added_workers) + VLOG(1) << " Added worker " << w; + for (const std::string& w : removed_workers) VLOG(1) << " Removed worker " << w; - for (const string& w : replaced_workers) + for (const std::string& w : replaced_workers) VLOG(1) << " Replaced worker " << w; } if (!replaced_workers.empty()) { @@ -804,7 +805,7 @@ absl::Status UpdateContextWithServerDef(EagerContext* context, replaced_workers.end()); added_workers.insert(added_workers.end(), replaced_workers.begin(), replaced_workers.end()); - for (const string& w : replaced_workers) { + for (const std::string& w : replaced_workers) { existing_workers.erase( std::remove(existing_workers.begin(), existing_workers.end(), w), existing_workers.end()); @@ -868,7 +869,7 @@ absl::Status UpdateContextWithServerDef(EagerContext* context, } if (!existing_workers.empty()) { if (VLOG_IS_ON(1)) { - for (const string& w : existing_workers) { + for (const std::string& w : existing_workers) { VLOG(1) << "Updating cluster with existing worker " << w; } } @@ -883,7 +884,7 @@ absl::Status UpdateContextWithServerDef(EagerContext* context, } } - auto session_name = strings::StrCat("eager_", context_id); + auto session_name = absl::StrCat("eager_", context_id); auto* session_mgr = server->worker_env()->session_mgr; if (reset_context) { tsl::core::RefCountPtr r = @@ -937,15 +938,16 @@ absl::Status EagerContextDistributedManager::SetOrUpdateServerDef( if (reset_context) { const auto& cdf = server_def.cluster_device_filters(); for (const auto& jdf : cdf.jobs()) { - const string remote_prefix = "/job:" + jdf.name() + "/task:"; + const std::string remote_prefix = "/job:" + jdf.name() + "/task:"; for (const auto& tdf : jdf.tasks()) { const int32_t task_index = tdf.first; - std::vector device_filters(tdf.second.device_filters_size()); + std::vector device_filters( + tdf.second.device_filters_size()); for (int i = 0; i < tdf.second.device_filters_size(); i++) { device_filters[i] = tdf.second.device_filters(i); } - const string remote_worker = - strings::StrCat(remote_prefix, task_index); + const std::string remote_worker = + absl::StrCat(remote_prefix, task_index); TF_RETURN_IF_ERROR( context_->SetRemoteDeviceFilters(remote_worker, device_filters)); } @@ -973,9 +975,9 @@ absl::Status EagerContextDistributedManager::SetOrUpdateServerDef( absl::Status EagerContextDistributedManager::InitializeLocalOnlyContext( const ServerDef& server_def, int keep_alive_secs) { - string worker_name = - strings::StrCat("/job:", server_def.job_name(), - "/replica:0/task:", server_def.task_index()); + std::string worker_name = + absl::StrCat("/job:", server_def.job_name(), + "/replica:0/task:", server_def.task_index()); // New server created for new server_def. Unused if updating server_def. 
std::unique_ptr new_server; ServerInterface* server; @@ -985,7 +987,7 @@ absl::Status EagerContextDistributedManager::InitializeLocalOnlyContext( LOG_AND_RETURN_IF_ERROR( NewServerWithOptions(server_def, {device_mgr}, &new_server)); server = new_server.get(); - uint64 context_id = EagerContext::NewContextId(); + uint64_t context_id = EagerContext::NewContextId(); // Make master eager context accessible by local eager service, which might // receive send tensor requests from remote workers. LOG_AND_RETURN_IF_ERROR( @@ -995,7 +997,7 @@ absl::Status EagerContextDistributedManager::InitializeLocalOnlyContext( server->worker_env()->device_mgr->ListDeviceAttributes( &local_device_attributes); - auto session_name = strings::StrCat("eager_", context_id); + auto session_name = absl::StrCat("eager_", context_id); auto* session_mgr = server->worker_env()->session_mgr; tsl::core::RefCountPtr r = server->worker_env()->rendezvous_mgr->Find(context_id); @@ -1054,7 +1056,7 @@ absl::Status EagerContextDistributedManager::EnableCollectiveOps( const bool enable_coordination = !config.experimental().coordination_config().service_type().empty(); if (enable_coordination) { - auto session_name = strings::StrCat("eager_", context_->GetContextId()); + auto session_name = absl::StrCat("eager_", context_->GetContextId()); std::shared_ptr worker_session; auto* session_mgr = server->worker_env()->session_mgr; // Start coordination service within session if this is the leader. diff --git a/tensorflow/core/common_runtime/eager/context_test.cc b/tensorflow/core/common_runtime/eager/context_test.cc index 56cdcf5c5fa746..590abf83871f67 100644 --- a/tensorflow/core/common_runtime/eager/context_test.cc +++ b/tensorflow/core/common_runtime/eager/context_test.cc @@ -50,7 +50,7 @@ using ::testing::HasSubstr; typedef FunctionDefHelper FDH; // Return a fake device. -static Device* CreateDevice(const string& type, int n) { +static Device* CreateDevice(const std::string& type, int n) { class FakeDevice : public Device { public: explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {} @@ -99,7 +99,7 @@ class EagerContextTest : public ::testing::Test { TEST_F(EagerContextTest, CompositeDevice) { InitContext(SessionOptions(), DEVICE_PLACEMENT_EXPLICIT); - std::vector underlying_devices = { + std::vector underlying_devices = { "/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:1"}; CompositeDevice* composite_device_0 = nullptr; @@ -134,10 +134,10 @@ TEST_F(EagerContextTest, CompositeDevice) { TEST_F(EagerContextTest, CompositeDeviceWithGivenName) { InitContext(SessionOptions(), DEVICE_PLACEMENT_EXPLICIT); - const std::vector underlying_devices_0 = { + const std::vector underlying_devices_0 = { "/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:1"}; - const string composite_device_name = + const std::string composite_device_name = "/job:worker1/replica:0/task:0/device:COMPOSITE:5"; // Create a CompositeDevice with the given name. CompositeDevice* composite_device_0 = nullptr; @@ -150,7 +150,7 @@ TEST_F(EagerContextTest, CompositeDeviceWithGivenName) { context()->FindCompositeDeviceFromName(composite_device_name, &device)); EXPECT_EQ(device, composite_device_0); - std::vector underlying_devices_1 = { + std::vector underlying_devices_1 = { "/job:worker/replica:0/task:0/device:CPU:1", "/job:worker/replica:0/task:0/device:CPU:2"}; // Find a CompositeDevice with the given name. 
diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h index 34fe7f2b122de0..d12f4965e1fded 100644 --- a/tensorflow/core/common_runtime/eager/copy_to_device_node.h +++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h @@ -71,8 +71,8 @@ class CopyToDeviceNode : public EagerNode { void Abort(absl::Status status) override { dst_->Poison(status, dstd_); } - string DebugString() const override { - string out = "[CopyToDeviceNode]"; + std::string DebugString() const override { + std::string out = "[CopyToDeviceNode]"; absl::StrAppend(&out, " src_tensor: ", src_->DebugString()); absl::StrAppend(&out, ", dst_tensor: ", dst_->DebugString()); absl::StrAppend(&out, ", dst_device: ", dstd_ ? dstd_->name() : "[]"); diff --git a/tensorflow/core/common_runtime/eager/custom_device.h b/tensorflow/core/common_runtime/eager/custom_device.h index 2f4f5acc95549f..f72f76b0f5a7ca 100644 --- a/tensorflow/core/common_runtime/eager/custom_device.h +++ b/tensorflow/core/common_runtime/eager/custom_device.h @@ -37,13 +37,14 @@ class CustomDeviceTensorHandle; class CustomDevice { public: virtual ~CustomDevice() = default; - virtual const string& name() = 0; + virtual const std::string& name() = 0; virtual absl::Status CopyTensorToDevice( ImmediateExecutionTensorHandle* tensor, ImmediateExecutionTensorHandle** result) = 0; virtual absl::Status CopyTensorFromDevice( - ImmediateExecutionTensorHandle* tensor, const string& target_device_name, + ImmediateExecutionTensorHandle* tensor, + const std::string& target_device_name, ImmediateExecutionTensorHandle** result) = 0; virtual absl::Status Execute(const ImmediateExecutionOperation* op, diff --git a/tensorflow/core/common_runtime/eager/custom_device_op_handler.cc b/tensorflow/core/common_runtime/eager/custom_device_op_handler.cc index 426930f04b8cda..2a736e67bae789 100644 --- a/tensorflow/core/common_runtime/eager/custom_device_op_handler.cc +++ b/tensorflow/core/common_runtime/eager/custom_device_op_handler.cc @@ -26,7 +26,7 @@ namespace tensorflow { void CustomDeviceOpHandler::Clear() { custom_devices_.clear(); } absl::Status CustomDeviceOpHandler::RegisterCustomDevice( - const string& device_name, std::unique_ptr device) { + const std::string& device_name, std::unique_ptr device) { DeviceNameUtils::ParsedName parsed; if (!DeviceNameUtils::ParseFullName(device_name, &parsed) || !parsed.has_job || !parsed.has_replica || !parsed.has_task || @@ -46,7 +46,7 @@ absl::Status CustomDeviceOpHandler::RegisterCustomDevice( } bool CustomDeviceOpHandler::FindCustomDeviceFromName( - const string& name, CustomDevice** device) const { + const std::string& name, CustomDevice** device) const { auto dev_it = custom_devices_.find(name); if (dev_it == custom_devices_.end()) { return false; diff --git a/tensorflow/core/common_runtime/eager/custom_device_op_handler.h b/tensorflow/core/common_runtime/eager/custom_device_op_handler.h index 6c38e50d458dcd..66d186014b2176 100644 --- a/tensorflow/core/common_runtime/eager/custom_device_op_handler.h +++ b/tensorflow/core/common_runtime/eager/custom_device_op_handler.h @@ -29,11 +29,11 @@ class CustomDeviceOpHandler { public: ~CustomDeviceOpHandler() = default; // Register a new custom device. - absl::Status RegisterCustomDevice(const string& device_name, + absl::Status RegisterCustomDevice(const std::string& device_name, std::unique_ptr device); // Find the custom device from given name. Return true if it finds one. 
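RegisterCustomDevice above rejects malformed names and duplicate registrations; a minimal sketch of the duplicate guard using a plain map of owned pointers (CustomDevice here is a placeholder struct, not the real interface):

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>

struct CustomDevice {};  // placeholder for the real abstract interface

using DeviceMap =
    std::unordered_map<std::string, std::unique_ptr<CustomDevice>>;

bool RegisterDevice(DeviceMap& devices, const std::string& name,
                    std::unique_ptr<CustomDevice> device) {
  // emplace only inserts when `name` is not already registered.
  return devices.emplace(name, std::move(device)).second;
}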
- bool FindCustomDeviceFromName(const string& name, + bool FindCustomDeviceFromName(const std::string& name, CustomDevice** device) const; absl::Status Execute(ImmediateExecutionOperation* op, @@ -53,7 +53,8 @@ class CustomDeviceOpHandler { void Clear(); private: - std::unordered_map> custom_devices_; + std::unordered_map> + custom_devices_; }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/eager_executor.cc b/tensorflow/core/common_runtime/eager/eager_executor.cc index fc552f3127576d..02f8eae99fb80a 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor.cc +++ b/tensorflow/core/common_runtime/eager/eager_executor.cc @@ -117,7 +117,7 @@ absl::Status EagerExecutor::SyncExecute(EagerNode* node) { } // NOTE: SyncExecute runs every node regardless of error status in executor. - uint64 id = next_node_id_++; + uint64_t id = next_node_id_++; absl::Status s = node->Prepare(); if (!s.ok()) { @@ -312,9 +312,9 @@ void EagerExecutor::NodeDone(const core::RefCountPtr& item, // a deadlock. } -void EagerExecutor::NotifyWaiters(uint64 id) { +void EagerExecutor::NotifyWaiters(uint64_t id) { if (!node_done_notifications_.empty()) { - uint64 upperbound_id = 0; + uint64_t upperbound_id = 0; if (!unfinished_nodes_.empty()) { upperbound_id = unfinished_nodes_.begin()->first - 1; } else if (!node_queue_.empty()) { diff --git a/tensorflow/core/common_runtime/eager/eager_executor.h b/tensorflow/core/common_runtime/eager/eager_executor.h index 7826b271ec98f1..ff8ce9cbc7322c 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor.h +++ b/tensorflow/core/common_runtime/eager/eager_executor.h @@ -76,7 +76,7 @@ class EagerNode { virtual AsyncEagerNode* AsAsync() { return nullptr; } virtual AsyncRemoteExecuteNode* AsAsyncRemoteExecuteNode() { return nullptr; } - virtual string DebugString() const = 0; + virtual std::string DebugString() const = 0; // Indicates whether a node failure should make the executor unusable. virtual bool Fatal() const { return true; } @@ -193,7 +193,7 @@ class EagerExecutor { struct NodeItem : core::RefCounted { // Unique id generated in EagerExecutor::Add(). If item1.id < item2.id, it // means item1.node is added before item2.node. - uint64 id; + uint64_t id; std::unique_ptr node; NodeState state; }; @@ -203,7 +203,8 @@ class EagerExecutor { void NodeDone(const core::RefCountPtr& item, const absl::Status& status, bool from_queue); - void NotifyWaiters(uint64 id) TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_); + void NotifyWaiters(uint64_t id) + TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_); // Starts execution of pending EagerNodes. This function loops till executor // state_ is set to kShutDown. If any errors are encountered, these are set @@ -220,9 +221,9 @@ class EagerExecutor { absl::Status WaitForAllPendingNodesLocked(mutex_lock* lock) TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_); - absl::Status WaitImpl(bool wait_all, uint64 node_id); + absl::Status WaitImpl(bool wait_all, uint64_t node_id); - std::atomic next_node_id_; + std::atomic next_node_id_; mutable mutex node_queue_mutex_; @@ -236,7 +237,7 @@ class EagerExecutor { TF_GUARDED_BY(node_queue_mutex_); // Ordered by NodeItem::id. - std::map, std::less> + std::map, std::less> unfinished_nodes_ TF_GUARDED_BY(node_queue_mutex_); // `status_` is set based on any errors raised during execution of a @@ -248,7 +249,7 @@ class EagerExecutor { // These condition_variables are notified and removed when that EagerNode is // done executing, or if an error is found in execution of any EagerNode. 
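NotifyWaiters walks the id-ordered notification map up to an upper-bound node id; a standalone sketch of that lookup with std::multimap and condition variables (the executor locking the real code requires is omitted):

#include <condition_variable>
#include <cstdint>
#include <map>

using Waiters = std::multimap<std::uint64_t, std::condition_variable*>;

void NotifyUpTo(Waiters& waiters, std::uint64_t upper_bound_id) {
  // Every entry keyed at or below upper_bound_id belongs to a finished node.
  const auto end = waiters.upper_bound(upper_bound_id);
  for (auto it = waiters.begin(); it != end; ++it) {
    it->second->notify_all();
  }
  waiters.erase(waiters.begin(), end);
}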
// The map is ordered by id. - std::multimap> + std::multimap> node_done_notifications_ TF_GUARDED_BY(node_queue_mutex_); // thread_exited_notification_ is notified by the `thread_` right before it diff --git a/tensorflow/core/common_runtime/eager/eager_executor_test.cc b/tensorflow/core/common_runtime/eager/eager_executor_test.cc index 3fc6f3860085f0..acaba8320ed871 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor_test.cc +++ b/tensorflow/core/common_runtime/eager/eager_executor_test.cc @@ -63,7 +63,7 @@ class TestEagerNode : public EagerNode { }; void Abort(absl::Status status) override {} - string DebugString() const override { return "testEagerNode"; } + std::string DebugString() const override { return "testEagerNode"; } private: TestState* state_; @@ -94,7 +94,7 @@ class TestAsyncEagerNode : public AsyncEagerNode { }; void Abort(absl::Status status) override {} - string DebugString() const override { return "testAsyncEagerNode"; } + std::string DebugString() const override { return "testAsyncEagerNode"; } private: TestState* state_; diff --git a/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h b/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h index bd7098473d7532..221d30d98518f6 100644 --- a/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h +++ b/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h @@ -28,7 +28,7 @@ namespace tensorflow { // implement the Run method. class EagerOpRewrite { public: - EagerOpRewrite(string name, string file, string line) { + EagerOpRewrite(std::string name, std::string file, std::string line) { debug_info_.name = name; debug_info_.file = file; debug_info_.line = line; @@ -43,7 +43,7 @@ class EagerOpRewrite { // Holds information about the rewrite registration. struct DebugInfo { - string name, file, line; + std::string name, file, line; }; // Returns information about the registered Eager op rewrite. @@ -75,7 +75,7 @@ class EagerOpRewriteRegistry { private: static constexpr int32_t kNumPhases = 2; // Holds all the registered Eager op rewrites and their ordinal numbers. - std::array, int32>>, + std::array, int32_t>>, kNumPhases> rewrites_; }; diff --git a/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry_test.cc b/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry_test.cc index d50f3e0a4ec411..e76627a3680daf 100644 --- a/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry_test.cc +++ b/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry_test.cc @@ -23,7 +23,7 @@ namespace tensorflow { class TestEagerOpRewrite : public EagerOpRewrite { public: - TestEagerOpRewrite(string name, string file, string line) + TestEagerOpRewrite(std::string name, std::string file, std::string line) : EagerOpRewrite(name, file, line), executor_(/*async=*/false, /*enable_streaming_enqueue=*/true) {} static int count_; diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc index b14cbeeba9bb81..d730df6b608b06 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.cc +++ b/tensorflow/core/common_runtime/eager/eager_operation.cc @@ -473,7 +473,7 @@ absl::Status EagerOperation::MutableTensorHandleInputs( } absl::Status EagerOperation::SetDeviceName(const char* c_name) { - string name(c_name != nullptr ? c_name : ""); + std::string name(c_name != nullptr ? 
c_name : ""); if (name != last_set_device_name_) { if (!DeviceNameUtils::ParseFullName(name, &device_parsed_name_)) { return errors::InvalidArgument("Malformed device specification '", name, @@ -498,7 +498,7 @@ bool EagerOperation::IsLocal() const { device_parsed_name_.task == host_cpu_name.task; } -string VariantDeviceDebugString(VariantDevice device) { +std::string VariantDeviceDebugString(VariantDevice device) { if (device == kVariantDeviceNull) { return "[]"; } else if (std::holds_alternative(device)) { @@ -513,8 +513,8 @@ void EagerOperation::AddAttrs(const AbstractOpAttrs* op_attrs) { attrs_.CopyAttributes(*(down_cast(op_attrs))); } -string EagerOperation::DebugString() const { - string out; +std::string EagerOperation::DebugString() const { + std::string out; VLOG(1) << "EagerOperation::DebugString() over " << this; absl::StrAppend(&out, "Name: ", Name(), "\n"); diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h index 04cefa00861198..b51e098413685d 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.h +++ b/tensorflow/core/common_runtime/eager/eager_operation.h @@ -57,9 +57,9 @@ class EagerOperation : public ImmediateExecutionOperation { return Reset(op, raw_device_name, false, nullptr); } - const string& Name() const override { return attrs_.op_name(); } + const std::string& Name() const override { return attrs_.op_name(); } - const string& DeviceName() const override { return device_name_; } + const std::string& DeviceName() const override { return device_name_; } ImmediateExecutionContext* GetContext() const override { return &ctx_; } @@ -196,7 +196,7 @@ class EagerOperation : public ImmediateExecutionOperation { // This is useful if we want the EagerOperation to point to a different // function. - void UpdateName(const string& name) { + void UpdateName(const std::string& name) { attrs_.set_op_name(name); op_name_ = attrs_.op_name(); } @@ -242,7 +242,7 @@ class EagerOperation : public ImmediateExecutionOperation { EagerExecutor& Executor() { return *executor_; } - string DebugString() const; + std::string DebugString() const; const absl::optional& eager_func_params() const { return eager_func_params_; @@ -289,12 +289,12 @@ class EagerOperation : public ImmediateExecutionOperation { // The last device name given to SetDeviceName. // This is used to avoid having to re-process the same device in repeated // calls to SetDeviceName. - string last_set_device_name_; + std::string last_set_device_name_; // The operation's device name. // This contains the named passed to SetDeviceName until device_ is set, // at which point it contains the device_ name. - string device_name_; + std::string device_name_; // The parsed device name. 
// This will always contain the result of diff --git a/tensorflow/core/common_runtime/eager/eager_operation_test.cc b/tensorflow/core/common_runtime/eager/eager_operation_test.cc index 499d2ef110bfd9..2ff6952eb0d17e 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation_test.cc +++ b/tensorflow/core/common_runtime/eager/eager_operation_test.cc @@ -68,7 +68,7 @@ TEST(EagerOperationTest, EagerFunctionParamsAndStepId) { auto op = new EagerOperation(ctx); EXPECT_FALSE(op->eager_func_params().has_value()); - string device_name = "/job:localhost/replica:0/task:0/device:CPU:0"; + std::string device_name = "/job:localhost/replica:0/task:0/device:CPU:0"; TF_ASSERT_OK(op->SetDeviceName(device_name.c_str())); TF_ASSERT_OK(op->Reset("DummyFunction", device_name.c_str())); diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index d12d51db3907f9..547336cdeb6d76 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -132,8 +132,8 @@ bool SendAsProtosWhenPossible() { return send_as_protos_when_possible; } -const string& DeviceNameOrUnspecified(Device* device) { - static string* unspecified_string = new string(""); +const std::string& DeviceNameOrUnspecified(Device* device) { + static std::string* unspecified_string = new std::string(""); return (device == nullptr) ? *unspecified_string : device->name(); } @@ -158,7 +158,7 @@ absl::Status CopyInputToExpectedDevice(EagerContext* ctx, EagerOperation* op, // Should only be called when these don't match DCHECK(expected_input_device != handle_device); *result = nullptr; - const string& op_device_name = DeviceNameOrUnspecified(op_device); + const std::string& op_device_name = DeviceNameOrUnspecified(op_device); switch (ctx->GetDevicePlacementPolicy()) { case DEVICE_PLACEMENT_SILENT_FOR_INT32: @@ -314,7 +314,7 @@ absl::Status GetDeviceForInput(const EagerOperation& op, const bool is_host_memory_arg, TensorHandle* tensor_handle, Device** result) { Device* cpu_device = ctx.HostCPU(); - string device_name; + std::string device_name; if (tensor_handle->Type() != TensorHandle::LOCAL) { Device* device = tensor_handle->device(); device_name = device != nullptr ? device->name() : cpu_device->name(); @@ -473,7 +473,7 @@ absl::Status MustCompileWithXLA(const EagerOperation* op, // `has_jit_compile` and `device`. 
absl::Status HasNestedJitCompile(const EagerOperation& op, const EagerContext& ctx, bool* has_jit_compile, - string* device) { + std::string* device) { *has_jit_compile = false; const std::string kStatefulPartitionedCallOp = "StatefulPartitionedCall"; @@ -488,7 +488,7 @@ absl::Status HasNestedJitCompile(const EagerOperation& op, const FunctionLibraryDefinition* func_lib_def = op.FuncLibDef(); while (!function_names.empty()) { - const string& function_name = function_names.front(); + const std::string& function_name = function_names.front(); const FunctionDef* function_def = func_lib_def->Find(function_name); if (function_def == nullptr) { @@ -518,8 +518,8 @@ absl::Status HasNestedJitCompile(const EagerOperation& op, return absl::OkStatus(); } -string CanonicalizeDeviceType(std::string_view device_type) { - string canonical_device_type = "Unknown"; +std::string CanonicalizeDeviceType(std::string_view device_type) { + std::string canonical_device_type = "Unknown"; if (device_type == "XLA_CPU" || device_type == tensorflow::DEVICE_CPU) { canonical_device_type = tensorflow::DEVICE_CPU; } @@ -542,11 +542,12 @@ absl::Status UpdateCompileCounter(const EagerOperation* op, return absl::OkStatus(); } - string device_type = CanonicalizeDeviceType(op->GetDeviceParsedName().type); - string compilation_option = kDisabled; + std::string device_type = + CanonicalizeDeviceType(op->GetDeviceParsedName().type); + std::string compilation_option = kDisabled; if (!compile_with_xla) { bool nested_jit_compile = false; - string device; + std::string device; if (!ctx.FuncLibDef()->HasOptimizedFunctionGraph(op->Name())) { TF_RETURN_IF_ERROR( HasNestedJitCompile(*op, ctx, &nested_jit_compile, &device)); @@ -586,14 +587,14 @@ absl::Status UpdateCompileCounter(const EagerOperation* op, using ProtoArgListType = protobuf::RepeatedPtrField; -string EscapeOrigName(const string& orig_name) { +std::string EscapeOrigName(const std::string& orig_name) { // Replace _ with __ in the original name to avoid name conflicts. return absl::StrReplaceAll(orig_name, {{"_", "__"}}); } // Variadic args are flattened during wrapping. This utility returns the name // of a flattened arg/attr. -string GetFlatName(const string orig_name, int index) { +std::string GetFlatName(const std::string orig_name, int index) { return absl::StrCat(EscapeOrigName(orig_name), "_", index); } @@ -607,13 +608,14 @@ string GetFlatName(const string orig_name, int index) { // IdentityN[T:[DT_FLOAT, DT_INT64]] -> __wrapped__IdentityN_T_2 // Concat[N:2, T:DT_FLOAT] -> __wrapped__Concat_N_2 absl::Status BuildWrappedOpName(EagerOperation* op, const OpDef& opdef, - const AbstractOpAttrs* op_attrs, string* name) { - string fname = absl::StrCat("__wrapped__", EscapeOrigName(op->Name())); + const AbstractOpAttrs* op_attrs, + std::string* name) { + std::string fname = absl::StrCat("__wrapped__", EscapeOrigName(op->Name())); // For every variadic arg in `args`, populates `attr_to_len` with // (attr_name, len(arg)). 
auto FillAttrToLen = [op_attrs, op]( const ProtoArgListType& args, - absl::btree_map* attr_to_len) { + absl::btree_map* attr_to_len) { for (const auto& arg : args) { if (!arg.type_list_attr().empty()) { absl::InlinedVector type_list; @@ -631,7 +633,7 @@ absl::Status BuildWrappedOpName(EagerOperation* op, const OpDef& opdef, } return absl::OkStatus(); }; - absl::btree_map attr_to_len; + absl::btree_map attr_to_len; TF_RETURN_IF_ERROR(FillAttrToLen(opdef.input_arg(), &attr_to_len)); TF_RETURN_IF_ERROR(FillAttrToLen(opdef.output_arg(), &attr_to_len)); for (auto& name_len : attr_to_len) { @@ -768,7 +770,8 @@ absl::Status BuildWrappedOpName(EagerOperation* op, const OpDef& opdef, // Note that the N attr is preserved so that it can get copied to the // inner op via a placeholder. This allows additional verification. absl::Status BuildWrappedOpSignature(EagerOperation* op, const OpDef& opdef, - const string& fname, OpDef& signature) { + const std::string& fname, + OpDef& signature) { signature = opdef; signature.clear_input_arg(); signature.clear_output_arg(); @@ -777,7 +780,7 @@ absl::Status BuildWrappedOpSignature(EagerOperation* op, const OpDef& opdef, auto FillSignatureArgs = [op_attrs, op]( const ProtoArgListType& opdef_args, ProtoArgListType* sig_args, - absl::flat_hash_set& new_attrs) { + absl::flat_hash_set& new_attrs) { for (const auto& arg : opdef_args) { if (!arg.type_list_attr().empty()) { absl::InlinedVector type_list; @@ -817,7 +820,7 @@ absl::Status BuildWrappedOpSignature(EagerOperation* op, const OpDef& opdef, } return absl::OkStatus(); }; - absl::flat_hash_set new_attrs; + absl::flat_hash_set new_attrs; TF_RETURN_IF_ERROR(FillSignatureArgs( opdef.input_arg(), signature.mutable_input_arg(), new_attrs)); TF_RETURN_IF_ERROR(FillSignatureArgs( @@ -838,7 +841,7 @@ absl::Status AddMixedTypeListAttrs(EagerOperation* wrapped_op, const OpDef& opdef) { auto FillAttrsToAdd = [op_attrs](const ProtoArgListType& opdef_args, - absl::flat_hash_map* attrs_to_add) { + absl::flat_hash_map* attrs_to_add) { for (const auto& arg : opdef_args) { if (!arg.type_list_attr().empty()) { absl::InlinedVector type_list; @@ -852,7 +855,7 @@ absl::Status AddMixedTypeListAttrs(EagerOperation* wrapped_op, } return absl::OkStatus(); }; - absl::flat_hash_map attrs_to_add; + absl::flat_hash_map attrs_to_add; TF_RETURN_IF_ERROR(FillAttrsToAdd(opdef.input_arg(), &attrs_to_add)); TF_RETURN_IF_ERROR(FillAttrsToAdd(opdef.output_arg(), &attrs_to_add)); for (auto& name_type : attrs_to_add) { @@ -867,7 +870,8 @@ absl::Status AddMixedTypeListAttrs(EagerOperation* wrapped_op, // outputs which need to be flattened. absl::Status PopulateRetMap(FunctionDef* fdef, const AbstractOpAttrs* op_attrs, const EagerOperation* op, const OpDef& opdef, - const OpDef& signature, const string& node_name) { + const OpDef& signature, + const std::string& node_name) { int next_sig_output = 0; for (size_t i = 0; i < opdef.output_arg_size(); i++) { const auto& output_arg = opdef.output_arg(i); @@ -916,7 +920,7 @@ absl::Status WrapInCallOp(EagerOperation* op, EagerOperation** wrapped_op) { // TODO(srbs): Support list inputs/outputs. 
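The wrapped-op name examples in the comments above (e.g. IdentityN with a two-element type list becoming __wrapped__IdentityN_T_2) follow from EscapeOrigName plus the appended attr lengths; a small sketch using the same absl string helpers, where the helper name WrappedName is illustrative only:

#include <string>
#include "absl/strings/str_cat.h"
#include "absl/strings/str_replace.h"

std::string WrappedName(const std::string& op_name,
                        const std::string& attr_name, int len) {
  // Double every "_" so the generated name cannot collide with an op name
  // that already contains the separator.
  std::string escaped = absl::StrReplaceAll(op_name, {{"_", "__"}});
  return absl::StrCat("__wrapped__", escaped, "_", attr_name, "_", len);
}
// WrappedName("IdentityN", "T", 2) -> "__wrapped__IdentityN_T_2"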
auto verify_wrappable_in_call_op = [](const OpDef& opdef, EagerOperation* op) -> absl::Status { - absl::flat_hash_set opdef_attrs; + absl::flat_hash_set opdef_attrs; for (const auto& attr : opdef.attr()) { opdef_attrs.insert(attr.name()); } @@ -941,7 +945,7 @@ absl::Status WrapInCallOp(EagerOperation* op, EagerOperation** wrapped_op) { // This can be avoided by introducing a dict in EagerContext that stores a // mapping from the eager op's name to its unique FunctionDef name. auto op_attrs = op->GetOpAttrs(); - string fname; + std::string fname; TF_RETURN_IF_ERROR(BuildWrappedOpName(op, opdef, op_attrs, &fname)); if (!op->EagerContext().GetFunctionDef(fname)) { FunctionDef fdef; @@ -1168,7 +1172,8 @@ absl::StatusOr GetKernelCacheKey( absl::Status ExtractFunctionInputInfo( EagerOperation* op, const KernelDef* kernel_def, std::vector& input_device_ptrs, - absl::flat_hash_map*>& composite_devices, + absl::flat_hash_map*>& + composite_devices, std::unordered_map& input_resource_variable_dtypes_and_shapes) { tsl::profiler::TraceMe activity("EagerCopyToDevice", @@ -1268,7 +1273,7 @@ absl::Status GetOrCreateKernelAndDevice( if (is_small_constant_optimization_enabled(*op)) { TF_ASSIGN_OR_RETURN(BoolTensorInputs bool_inputs, GetBoolInputs(op, /*delete_inputs=*/false)); - string folded_name = op->Name(); + std::string folded_name = op->Name(); for (const auto& [input_name, input_value] : bool_inputs) { folded_name = small_constants_optimizer::FoldedFunctionName( folded_name, input_name, input_value); @@ -1320,7 +1325,8 @@ absl::Status GetOrCreateKernelAndDevice( (ctx.RunEagerOpAsFunction() && !op->is_function()); std::vector input_device_ptrs; - absl::flat_hash_map*> composite_devices; + absl::flat_hash_map*> + composite_devices; std::unordered_map input_resource_variable_dtypes_and_shapes; const KernelDef* kernel_def = nullptr; @@ -1380,7 +1386,7 @@ absl::Status GetOrCreateKernelAndDevice( bool run_function_with_flr = false; bool function_runs_at_most_once = FunctionRunsAtMostOnce(op, ctx); - std::optional xla_compile_device_type; + std::optional xla_compile_device_type; if (op->is_function()) { bool compile_with_xla; // By default we should run functions with FunctionLibraryRuntime. @@ -1474,7 +1480,8 @@ absl::Status GetOrCreateKernelAndDevice( // Check if any of the Op's output_arg(s) are pinned to Host. 
if (kernel_def == nullptr) return false; const OpDef& op_def = OpRegistry::Global()->LookUp(op->Name())->op_def; - for (const string& host_memory_arg : kernel_def->host_memory_arg()) { + for (const std::string& host_memory_arg : + kernel_def->host_memory_arg()) { for (const auto& output_arg : op_def.output_arg()) { if (output_arg.name() == host_memory_arg) { return false; @@ -1613,7 +1620,7 @@ absl::Status CreateUnshapedOutput( return errors::InvalidArgument( "Unable to find a remote op id for a remote output of ", kernel.name()); } - string remote_task; + std::string remote_task; if (!DeviceNameUtils::GetTaskName(output_device->parsed_name(), &remote_task)) { return errors::InvalidArgument( @@ -1762,8 +1769,8 @@ absl::Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals, TF_RETURN_IF_ERROR(ValidateInputTypeAndPlacement(&ctx, op, kernel)); if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) { - string msg = absl::StrCat("Executing op ", op->Name(), " in device ", - kernel->device()->name()); + std::string msg = absl::StrCat("Executing op ", op->Name(), " in device ", + kernel->device()->name()); if (!logging::LogToListeners(msg)) { LOG(INFO) << msg; } @@ -1828,15 +1835,15 @@ absl::Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, // TODO(fishx): Remove following code when lazy tensor copy is ready. if (op->Device() == kVariantDeviceNull) { tensorflow::Device* device = nullptr; - string device_name = op->DeviceName(); + std::string device_name = op->DeviceName(); TF_RETURN_IF_ERROR(ctx.FindDeviceFromName(device_name.c_str(), &device)); op->SetDevice(device); } core::RefCountPtr eager_client; - uint64 context_id = ctx.GetContextId(); + uint64_t context_id = ctx.GetContextId(); TF_RETURN_IF_ERROR(ctx.GetClient(op->GetDeviceParsedName(), &eager_client)); - string remote_task; + std::string remote_task; if (!DeviceNameUtils::GetTaskName(op->GetDeviceParsedName(), &remote_task)) { return errors::InvalidArgument( "Unable to find remote task corresponding to device ", @@ -1859,7 +1866,7 @@ absl::Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, tensorflow::TensorHandle* input = (*inputs)[i]; tensorflow::Device* input_device = input->device(); tensorflow::Device* input_device_or_cpu = input->DeviceOrHostCPU(ctx); - const string* input_device_name = &input_device_or_cpu->name(); + const std::string* input_device_name = &input_device_or_cpu->name(); bool serialize_resource_dtype_and_shape = false; if (op_device != input_device && // If the expected and actual devices are on the same task, don't @@ -1986,7 +1993,7 @@ absl::Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, } *num_retvals = num_outputs; - const tensorflow::uint64 id = remote_op->id(); + const uint64_t id = remote_op->id(); for (size_t i = 0; i < num_outputs; ++i) { // TODO(nareshmodi): Change the callback to instead add the decref to a // list of pending decrefs that we can send as a batch with the next @@ -2048,7 +2055,7 @@ absl::Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, {retvals, num_outputs})); if (op->EagerContext().LogDevicePlacement() || VLOG_IS_ON(1)) { - string msg = absl::StrCat( + std::string msg = absl::StrCat( "Executing op ", op->Name(), " on task ", DeviceNameUtils::ParsedNameToString(op->GetDeviceParsedName())); if (!logging::LogToListeners(msg)) { @@ -2362,7 +2369,7 @@ absl::Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx, return errors::Unimplemented( "Eager's remote execution is not available on mobile 
devices."); #else // !IS_MOBILE_PLATFORM - uint64 recv_op_id = 0; + uint64_t recv_op_id = 0; if (receiver_is_local) { Device* d = ctx->CanonicalDevice(device); // TODO(gjn): Need to add support for async execution. Note if receiver @@ -2403,7 +2410,7 @@ absl::Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx, return absl::OkStatus(); } } - string remote_task; + std::string remote_task; if (!DeviceNameUtils::GetTaskName(device->parsed_name(), &remote_task)) { return errors::InvalidArgument( "Unable to find remote task corresponding to device ", @@ -2523,8 +2530,8 @@ void EagerLocalExecuteAsync(EagerOperation* op, TensorHandle** retvals, } if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) { - string msg = absl::StrCat("Executing op ", op->Name(), " in device ", - kernel->device()->name()); + std::string msg = absl::StrCat("Executing op ", op->Name(), " in device ", + kernel->device()->name()); if (!logging::LogToListeners(msg)) { LOG(INFO) << msg; } diff --git a/tensorflow/core/common_runtime/eager/execute_node.cc b/tensorflow/core/common_runtime/eager/execute_node.cc index 09bebd3e1f7cf2..a8fb4fc308affe 100644 --- a/tensorflow/core/common_runtime/eager/execute_node.cc +++ b/tensorflow/core/common_runtime/eager/execute_node.cc @@ -23,7 +23,7 @@ namespace tensorflow { #if !defined(IS_MOBILE_PLATFORM) bool ExecuteNodeArgs::IsRemote(EagerContext* ctx, Device* input_device, TensorHandle* handle) { - uint64 context_view_id = ctx->GetContextViewId(); + uint64_t context_view_id = ctx->GetContextViewId(); if (handle->Type() == TensorHandle::REMOTE || handle->HasRemoteMirror(input_device, context_view_id)) { if (!has_remote_inputs_) { diff --git a/tensorflow/core/common_runtime/eager/execute_test.cc b/tensorflow/core/common_runtime/eager/execute_test.cc index ea174fd22f76a2..5427851f3d8b3f 100644 --- a/tensorflow/core/common_runtime/eager/execute_test.cc +++ b/tensorflow/core/common_runtime/eager/execute_test.cc @@ -70,7 +70,7 @@ TEST(ExecuteTest, SimpleFunction) { false, &device_mgr, false, nullptr, nullptr); const Tensor kTwo = test::AsScalar(2); - const string function_name = "XTimesTwo"; + const std::string function_name = "XTimesTwo"; const FunctionDef x_times_two = FunctionDefHelper::Define( // Name function_name, @@ -125,7 +125,7 @@ TEST(ExecuteTest, SimpleFunctionInt32BadFullType) { /*run_eager_op_as_function=*/true); const Tensor kTwo = test::AsScalar(2); - const string function_name = "XTimesTwo"; + const std::string function_name = "XTimesTwo"; const FunctionDef x_times_two = FunctionDefHelper::Define( // Name function_name, @@ -188,7 +188,7 @@ TEST(ExecuteTest, CompiledFunction) { false, &device_mgr, false, nullptr, nullptr); const Tensor kTwo = test::AsScalar(2); - const string function_name = "XTimesTwo"; + const std::string function_name = "XTimesTwo"; const FunctionDef x_times_two = FunctionDefHelper::Define( // Name function_name, @@ -245,7 +245,7 @@ TEST(ExecuteTest, NestedCompiledFunction) { false, &device_mgr, false, nullptr, nullptr); const Tensor kTwo = test::AsScalar(2); - const string function_name = "XTimesTwo"; + const std::string function_name = "XTimesTwo"; const FunctionDef x_times_two = FunctionDefHelper::Define( // Name function_name, @@ -266,7 +266,7 @@ TEST(ExecuteTest, NestedCompiledFunction) { }); TF_ASSERT_OK(ctx->AddFunctionDef(x_times_two)); - const string call_function_name = "FunctionCall"; + const std::string call_function_name = "FunctionCall"; const FunctionDef function_call = FunctionDefHelper::Define( // Name call_function_name, @@ -325,7 +325,7 @@ 
TEST(ExecuteTest, MultipleNestedCompiledFunction) { false, &device_mgr, false, nullptr, nullptr); const Tensor kTwo = test::AsScalar(2); - const string function_name = "XTimesTwo"; + const std::string function_name = "XTimesTwo"; const FunctionDef x_times_two = FunctionDefHelper::Define( // Name function_name, @@ -346,7 +346,7 @@ TEST(ExecuteTest, MultipleNestedCompiledFunction) { }); TF_ASSERT_OK(ctx->AddFunctionDef(x_times_two)); - const string call_function_name = "FunctionCall"; + const std::string call_function_name = "FunctionCall"; FunctionDef function_call = FunctionDefHelper::Define( // Name call_function_name, @@ -379,7 +379,7 @@ TEST(ExecuteTest, MultipleNestedCompiledFunction) { TF_ASSERT_OK(ctx->AddFunctionDef(function_call)); - const string call_function_name2 = "FunctionCall2"; + const std::string call_function_name2 = "FunctionCall2"; const FunctionDef function_call2 = FunctionDefHelper::Define( // Name call_function_name2, diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h index 790151d92129a4..ba437b5df5e37d 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.h +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h @@ -193,7 +193,7 @@ class KernelAndDevice : public core::RefCounted { virtual int num_inputs() const = 0; virtual int num_outputs() const = 0; - virtual const string& name() const = 0; + virtual const std::string& name() const = 0; protected: std::function)>* get_runner() const; @@ -262,7 +262,7 @@ class KernelAndDeviceOp final : public KernelAndDevice { } int num_inputs() const override { return kernel_->num_inputs(); } int num_outputs() const override { return kernel_->num_outputs(); } - const string& name() const override { return kernel_->name(); } + const std::string& name() const override { return kernel_->name(); } private: std::unique_ptr kernel_; @@ -286,19 +286,20 @@ class KernelAndDeviceFunc : public KernelAndDevice { KernelAndDeviceFunc( FunctionLibraryRuntime* flr, ProcessFunctionLibraryRuntime* pflr, std::vector input_devices, - absl::flat_hash_map*> composite_devices, + absl::flat_hash_map*> + composite_devices, std::unordered_map input_resource_dtypes_and_shapes, std::function)>* runner, std::unique_ptr collective_executor, - Device* host_cpu_device, const string& name, + Device* host_cpu_device, const std::string& name, const bool outputs_on_op_device, const bool allow_small_function_optimizations, const bool allow_control_flow_sync_execution, const bool shape_inference_on_tfe_dialect_import, const bool int_args_and_retvals_on_device, const bool function_runs_at_most_once, - std::optional xla_compile_device_type, + std::optional xla_compile_device_type, const bool allow_soft_placement, Rendezvous::Factory rendezvous_factory, std::function get_op_id) : KernelAndDevice(flr, runner, std::move(collective_executor), @@ -366,7 +367,7 @@ class KernelAndDeviceFunc : public KernelAndDevice { } int num_inputs() const override { return input_dtypes_.size(); } int num_outputs() const override { return output_dtypes_.size(); } - const string& name() const override { return name_; }; + const std::string& name() const override { return name_; }; private: std::shared_ptr PrepareForRun( @@ -402,7 +403,7 @@ class KernelAndDeviceFunc : public KernelAndDevice { const bool function_runs_at_most_once_; - const absl::optional xla_compile_device_type_; + const absl::optional xla_compile_device_type_; const bool allow_soft_placement_; @@ -413,13 +414,14 @@ class 
KernelAndDeviceFunc : public KernelAndDevice { // devices. std::vector input_devices_; // Maps from a CompositeDevice name to a list of physical device names. - absl::flat_hash_map*> composite_devices_; + absl::flat_hash_map*> + composite_devices_; std::unordered_map input_resource_dtypes_and_shapes_; DataTypeVector input_dtypes_; DataTypeVector output_dtypes_; - string name_; + std::string name_; Rendezvous::Factory rendezvous_factory_; std::function get_op_id_; diff --git a/tensorflow/core/common_runtime/eager/placement_utils.cc b/tensorflow/core/common_runtime/eager/placement_utils.cc index e6d547d1e9832b..9b6e0e66a72a64 100644 --- a/tensorflow/core/common_runtime/eager/placement_utils.cc +++ b/tensorflow/core/common_runtime/eager/placement_utils.cc @@ -34,19 +34,20 @@ namespace eager { // generate and then copy the data instead of just generating the data on the // device directly. static bool IsPinnableOp(absl::string_view op_name) { - static const gtl::FlatSet* unpinnable_ops = new gtl::FlatSet({ - "RandomUniform", - "RandomUniformInt", - "RandomStandardNormal", - "StatelessRandomUniform", - "StatelessRandomUniformInt", - "StatelessRandomUniformFullInt", - "StatelessRandomNormal", - }); + static const gtl::FlatSet* unpinnable_ops = + new gtl::FlatSet({ + "RandomUniform", + "RandomUniformInt", + "RandomStandardNormal", + "StatelessRandomUniform", + "StatelessRandomUniformInt", + "StatelessRandomUniformFullInt", + "StatelessRandomNormal", + }); // XRT ops refer to per-device handles that are not safe to move between // devices. - return unpinnable_ops->find(string(op_name)) == unpinnable_ops->end() && + return unpinnable_ops->find(std::string(op_name)) == unpinnable_ops->end() && !absl::StartsWith(op_name, "XRT"); } // Validate if the remote device with the given incarnation is valid in the @@ -64,12 +65,12 @@ static absl::Status ValidateTensorHandleRemoteDevice( bool IsColocationExempt(absl::string_view op_name) { const auto& exempt_ops = InputColocationExemptionRegistry::Global()->Get(); - return exempt_ops.find(string(op_name)) != exempt_ops.end(); + return exempt_ops.find(std::string(op_name)) != exempt_ops.end(); } bool IsFunction(absl::string_view op_name) { const OpDef* op_def = nullptr; - absl::Status s = OpDefForOp(string(op_name), &op_def); + absl::Status s = OpDefForOp(std::string(op_name), &op_def); if (!s.ok()) { if (!absl::IsNotFound(s)) { LOG(WARNING) << "Looking up OpDef failed with error: " << s; diff --git a/tensorflow/core/common_runtime/eager/placement_utils_test.cc b/tensorflow/core/common_runtime/eager/placement_utils_test.cc index c543b9475a072c..aadec6deab8eb8 100644 --- a/tensorflow/core/common_runtime/eager/placement_utils_test.cc +++ b/tensorflow/core/common_runtime/eager/placement_utils_test.cc @@ -94,7 +94,7 @@ struct MaybePinSmallOpsToCpuTestCase { std::string test_name; DataType dtype; TensorShape shape; - string op_name; + std::string op_name; const char* device; bool expect; }; @@ -152,7 +152,7 @@ INSTANTIATE_TEST_SUITE_P( struct MaybePinToResourceDeviceTestCase { std::string test_name; DataType dtype; - string op_name; + std::string op_name; const char* device; bool expect; }; diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index d4faba6415579f..583a8f15a657f4 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -60,7 +60,7 @@ int64_t GetRemoteDeviceIncarnation(Device* device) { return 
device->attributes().incarnation(); } -string SafeDeviceDebugString(Device* device) { +std::string SafeDeviceDebugString(Device* device) { if (device == nullptr) { return "[]"; } else { @@ -150,8 +150,8 @@ void TensorHandle::PackedTensorHandleData::Poison(absl::Status status) { is_poisoned_ = status; } -string TensorHandle::PackedTensorHandleData::DebugString() const { - string debug_str = "PackedTensorHandleData: "; +std::string TensorHandle::PackedTensorHandleData::DebugString() const { + std::string debug_str = "PackedTensorHandleData: "; for (const auto* handle : handles_) { debug_str.append( absl::StrCat(std::visit([](auto& data) { return data.DebugString(); }, @@ -308,7 +308,7 @@ TensorHandle::TensorHandle(Device* d, Device* op_device, absl::Status TensorHandle::CreatePackedHandle( std::vector&& handles, const tensorflow::DataType dtype, - const tensorflow::TensorShape& shape, const string& device_name, + const tensorflow::TensorShape& shape, const std::string& device_name, EagerContext* ctx, TensorHandle** packed_handle) { if (handles.empty()) { return errors::InvalidArgument("Handles should not be empty."); @@ -319,7 +319,7 @@ absl::Status TensorHandle::CreatePackedHandle( TF_RETURN_IF_ERROR( handles.at(0)->GetResourceHandleDtypesAndShapes(&dtypes_and_shapes)); } - std::vector devices; + std::vector devices; devices.reserve(handles.size()); for (auto* handle : handles) { devices.push_back(handle->op_device() ? handle->op_device()->name() @@ -372,7 +372,7 @@ TensorHandle::TensorHandle(std::vector&& handles, Device* device, #if !defined(IS_MOBILE_PLATFORM) TensorHandle* TensorHandle::CreateUnshapedRemoteHandle( - int64_t op_id, int32_t output_num, const string& remote_task, + int64_t op_id, int32_t output_num, const std::string& remote_task, tensorflow::DataType dtype, Device* d, EagerContext* ctx, const bool unknown_device) { return new TensorHandle(op_id, output_num, remote_task, dtype, d, ctx, @@ -380,7 +380,7 @@ TensorHandle* TensorHandle::CreateUnshapedRemoteHandle( } TensorHandle::TensorHandle(int64_t op_id, int32_t output_num, - const string& remote_task, + const std::string& remote_task, tensorflow::DataType dtype, Device* d, EagerContext* ctx, const bool unknown_device) : ImmediateExecutionTensorHandle(kEager), @@ -450,7 +450,7 @@ TensorHandle::HandleType TensorHandle::Type() const { } } -string TensorHandle::TypeString() const { +std::string TensorHandle::TypeString() const { if (data_.index() == 0) { return "LOCAL"; } else if (data_.index() == 1) { @@ -713,7 +713,7 @@ absl::Status TensorHandle::AddEmptyLocalMirror(const Device* d) { absl::Status TensorHandle::RemoteAddress(const Device* d, const bool wait_until_ready, int64_t* op_id, - int32* output_num) const { + int32_t* output_num) const { DVLOG(3) << "RemoteAddress on TensorHandle: " << this << " device: " << d << " " << d->name(); @@ -759,7 +759,7 @@ absl::Status TensorHandle::RemoteAddress(const Device* d, } bool TensorHandle::HasRemoteMirror(const Device* d, - uint64 context_view_id) const { + uint64_t context_view_id) const { DVLOG(3) << "HasRemoteMirror on TensorHandle: " << this << " device: " << d << " " << d->name(); @@ -777,7 +777,7 @@ bool TensorHandle::HasRemoteMirror(const Device* d, } bool TensorHandle::HasResourceShapeMirror(const Device* d, - uint64 context_view_id) const { + uint64_t context_view_id) const { DVLOG(3) << "HasResourceShapeMirror on TensorHandle: " << this << " device: " << d << " " << d->name(); @@ -793,11 +793,9 @@ bool TensorHandle::HasResourceShapeMirror(const Device* d, return false; 
} -absl::Status TensorHandle::AddUnshapedRemoteMirror(const Device* d, - int64_t op_id, - int output_num, - const string& remote_task, - EagerContext* ctx) { +absl::Status TensorHandle::AddUnshapedRemoteMirror( + const Device* d, int64_t op_id, int output_num, + const std::string& remote_task, EagerContext* ctx) { DVLOG(3) << "AddUnshapedRemoteMirror on TensorHandle: " << this << " device: " << d << " " << d->name() << " op_id: " << op_id << " output_num: " << output_num; @@ -856,14 +854,14 @@ absl::Status TensorHandle::AddResourceShapeMirror(const Device* d, absl::Status TensorHandle::SetRemoteShape(const TensorShape& shape, const Device* d, - uint64 context_view_id) { + uint64_t context_view_id) { return SetRemoteShapeAndDevice(shape, d, context_view_id, /*op_device=*/""); } absl::Status TensorHandle::SetRemoteShapeAndDevice(const TensorShape& shape, const Device* d, - uint64 context_view_id, - string op_device) { + uint64_t context_view_id, + std::string op_device) { DVLOG(3) << "SetRemoteShape on TensorHandle: " << this << " device: " << d << " " << d->name(); @@ -930,7 +928,7 @@ absl::Status TensorHandle::SetRemoteShapeAndDevice(const TensorShape& shape, resource_device_ = dtype == DT_RESOURCE ? device : nullptr; resource_remote_device_incarnation_ = GetRemoteDeviceIncarnation(resource_device_); - string remote_task; + std::string remote_task; if (!DeviceNameUtils::GetTaskName(device->parsed_name(), &remote_task)) { return errors::InvalidArgument( "Unable to find remote task corresponding to device ", @@ -948,7 +946,7 @@ absl::Status TensorHandle::SetRemoteShapeAndDevice(const TensorShape& shape, } void TensorHandle::PoisonRemote(absl::Status status, const Device* d, - uint64 context_view_id) { + uint64_t context_view_id) { DVLOG(3) << "PoisonRemote on TensorHandle: " << this << " device: " << d << " " << d->name(); diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index ca60815d76ec9e..e2fdb872c317a2 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -66,9 +66,9 @@ class TensorHandle : public ImmediateExecutionTensorHandle { tensorflow::DataType dtype, EagerContext* ctx); #if !defined(IS_MOBILE_PLATFORM) - TensorHandle(int64_t op_id, int32_t output_num, const string& remote_task, - tensorflow::DataType dtype, Device* device, EagerContext* ctx, - bool unknown_device); + TensorHandle(int64_t op_id, int32_t output_num, + const std::string& remote_task, tensorflow::DataType dtype, + Device* device, EagerContext* ctx, bool unknown_device); TensorHandle(int64_t op_id, int32_t output_num, tensorflow::DataType dtype, Device* device, bool is_ready, EagerContext* ctx); #endif // IS_MOBILE_PLATFORM @@ -97,7 +97,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle { static absl::Status CreatePackedHandle(std::vector&& handles, tensorflow::DataType dtype, const tensorflow::TensorShape& shape, - const string& device_name, + const std::string& device_name, EagerContext* ctx, TensorHandle** packed_handle); static absl::Status CreatePackedHandle(std::vector&& handles, @@ -108,12 +108,10 @@ class TensorHandle : public ImmediateExecutionTensorHandle { // An unshaped remote handle refers to a tensor on a remote worker. It's not // ready until the shape is set. It controls the lifetime of the remote // tensor. 
- static TensorHandle* CreateUnshapedRemoteHandle(int64_t op_id, - int32_t output_num, - const string& remote_task, - tensorflow::DataType dtype, - Device* d, EagerContext* ctx, - bool unknown_device = false); + static TensorHandle* CreateUnshapedRemoteHandle( + int64_t op_id, int32_t output_num, const std::string& remote_task, + tensorflow::DataType dtype, Device* d, EagerContext* ctx, + bool unknown_device = false); // A lazy remote handle refers to a tensor on a remote worker. The lifetime of // the remote tensor is controlled by the remote worker, but not by the lazy // remote handle. Lazy handles are normally created on a default function @@ -189,12 +187,12 @@ class TensorHandle : public ImmediateExecutionTensorHandle { absl::Status AddLocalMirror(tensorflow::Tensor&& tensor, const Device* d); #if !defined(IS_MOBILE_PLATFORM) - bool HasRemoteMirror(const Device* d, uint64 context_view_id) const; - bool HasResourceShapeMirror(const Device* d, uint64 context_view_id) const; + bool HasRemoteMirror(const Device* d, uint64_t context_view_id) const; + bool HasResourceShapeMirror(const Device* d, uint64_t context_view_id) const; absl::Status AddUnshapedRemoteMirror(const Device* d, int64_t op_id, int output_num, - const string& remote_task, + const std::string& remote_task, EagerContext* ctx); absl::Status AddResourceShapeMirror(const Device* d, int64_t op_id, int output_num, EagerContext* ctx); @@ -203,7 +201,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle { // If wait_until_ready is true, block until the remote tensor is ready on the // given remote worker. absl::Status RemoteAddress(const Device* d, bool wait_until_ready, - int64_t* op_id, int32* output_num) const; + int64_t* op_id, int32_t* output_num) const; // Called on an async remote tensor once it's shape has been determined. This // transitions the tensor handle from a non-ready to a ready state by @@ -213,12 +211,13 @@ class TensorHandle : public ImmediateExecutionTensorHandle { // This method or Poison must be called exactly once for remote tensors that // were created without a known shape. absl::Status SetRemoteShape(const TensorShape& shape, const Device* d, - uint64 context_view_id); + uint64_t context_view_id); // If op_device is not empty, reset the devices of a remote tensor which is // created without known devices (e.g. function outputs). absl::Status SetRemoteShapeAndDevice(const TensorShape& shape, - const Device* d, uint64 context_view_id, - string op_device); + const Device* d, + uint64_t context_view_id, + std::string op_device); // Poisons either this handle or a remote mirror with error `status`. // Poisoning means that the handle will become ready and methods trying @@ -226,7 +225,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle { // Exactly one of SetRemoteShape or PoisonRemote methods must be called on a // unshaped handle on a remote device. void PoisonRemote(absl::Status status, const Device* d, - uint64 context_view_id); + uint64_t context_view_id); #endif // Sets the `tensor` for this async non-ready handle making it ready. 
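// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the recurring pattern in the
// tensor_handle changes above is to spell out the standard types
// (std::string, uint64_t, int32_t) in signatures instead of TensorFlow's
// legacy aliases (string, uint64, int32). The struct and names below are
// hypothetical and only demonstrate the before/after spelling; they are not
// TensorFlow APIs.
#include <cstdint>
#include <string>

namespace remote_handle_sketch {

// Before: int64_t op_id; int32 output_num; uint64 context_view_id;
//         string remote_task;
// After, using the spellings adopted in the signatures touched above:
struct RemoteAddress {
  int64_t op_id = 0;
  int32_t output_num = 0;
  uint64_t context_view_id = 0;
  std::string remote_task;  // e.g. "/job:worker/replica:0/task:1"
};

}  // namespace remote_handle_sketch
// -----------------------------------------------------------------------------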
@@ -260,7 +259,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle { enum HandleType { LOCAL = 0, PACKED = 1, REMOTE = 2 }; HandleType Type() const; - string TypeString() const; + std::string TypeString() const; void SetResourceHandleDtypeAndShape( std::vector dtypes_and_shapes); @@ -330,9 +329,9 @@ class TensorHandle : public ImmediateExecutionTensorHandle { // TODO(yujingzhang): Remove resource_shape_mirrors_ once scalable per-replica // variable is ready, since we could get the shape locally without remote copy // then. - std::unordered_map resource_shape_mirrors_ - TF_GUARDED_BY(mu_); - std::unordered_map remote_mirrors_ + std::unordered_map + resource_shape_mirrors_ TF_GUARDED_BY(mu_); + std::unordered_map remote_mirrors_ TF_GUARDED_BY(mu_); #endif @@ -371,7 +370,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle { bool IsReady() const; absl::Status WaitReady(const char* caller) const; void Poison(absl::Status status); - string DebugString() const; + std::string DebugString() const; // Number of packed handles. int NumPackedHandles() const; diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_data.cc b/tensorflow/core/common_runtime/eager/tensor_handle_data.cc index 2212b19db9c683..b0a089874dd744 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_data.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle_data.cc @@ -96,7 +96,7 @@ absl::Status LocalTensorHandleData::SetTensor(tensorflow::Tensor&& t) { return absl::OkStatus(); } -string LocalTensorHandleData::DebugString() const { +std::string LocalTensorHandleData::DebugString() const { if (IsReady()) { return tensor_.DeviceSafeDebugString(); } else { diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_data.h b/tensorflow/core/common_runtime/eager/tensor_handle_data.h index ed58e83a183bfe..73a20425871156 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_data.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle_data.h @@ -60,7 +60,7 @@ class LocalTensorHandleData { absl::Status SetTensor(tensorflow::Tensor&& t); - string DebugString() const; + std::string DebugString() const; private: tensorflow::Tensor tensor_; diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc index 209222d33f1185..0bd94f635f0f00 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc @@ -44,7 +44,7 @@ TEST(TensorHandle_ShapeTest, AsyncShape) { EXPECT_TRUE(t.shape().IsSameSize(TensorShape({2, 2}))); for (int64_t a = 0; a < t.shape().dim_size(0); a++) { for (int64_t b = 0; b < t.shape().dim_size(1); b++) { - t.matrix()(a, b) = uint16(a * b); + t.matrix()(a, b) = uint16_t(a * b); } } @@ -181,7 +181,7 @@ TEST_F(PackedTensorHandleTest, PackedHandle) { handles.push_back(h1); // Create 2 remote TensorHandles (not ready). 
- const string remote_task = "/job:worker/replica:0/task:1"; + const std::string remote_task = "/job:worker/replica:0/task:1"; Device* d2 = ListGPUDevices().at(2); TensorHandle* h2 = TensorHandle::CreateUnshapedRemoteHandle( /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d2, context()); @@ -439,7 +439,7 @@ TEST_F(RemoteTensorHandleTest, UnknownRemoteDevice) { tensorflow::DataType dtype = DT_FLOAT; TensorShape shape = {}; - const string remote_task = "/job:worker/replica:0/task:1"; + const std::string remote_task = "/job:worker/replica:0/task:1"; Device* d1 = device_mgr.ListDevices().at(1); TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle( /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context, @@ -478,7 +478,7 @@ TEST_F(RemoteTensorHandleTest, PoisonRemote) { tensorflow::DataType dtype = DT_FLOAT; TensorShape shape = {}; - const string remote_task = "/job:worker/replica:0/task:1"; + const std::string remote_task = "/job:worker/replica:0/task:1"; Device* d1 = device_mgr.ListDevices().at(1); TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle( /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context, @@ -519,7 +519,7 @@ TEST_F(RemoteTensorHandleTest, PoisonRemoteMirror) { tensorflow::DataType dtype = DT_FLOAT; TensorShape shape = {}; - const string remote_task = "/job:worker/replica:0/task:1"; + const std::string remote_task = "/job:worker/replica:0/task:1"; Device* d1 = device_mgr.ListDevices().at(1); TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle( /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context, @@ -565,7 +565,7 @@ TEST_F(RemoteTensorHandleTest, SetRemoteTensorHandleShapeTwice) { tensorflow::DataType dtype = DT_FLOAT; TensorShape shape = {}; - const string remote_task = "/job:worker/replica:0/task:1"; + const std::string remote_task = "/job:worker/replica:0/task:1"; Device* d1 = device_mgr.ListDevices().at(1); TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle( /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context, @@ -623,7 +623,7 @@ TEST_F(RemoteTensorHandleTest, SetRemoteMirrorShapeTwice) { tensorflow::DataType dtype = DT_FLOAT; TensorShape shape = {}; - const string remote_task = "/job:worker/replica:0/task:1"; + const std::string remote_task = "/job:worker/replica:0/task:1"; Device* d1 = device_mgr.ListDevices().at(1); TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle( /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context, diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD index c7a06f7e5c600b..87b1101bbab536 100644 --- a/tensorflow/core/common_runtime/gpu/BUILD +++ b/tensorflow/core/common_runtime/gpu/BUILD @@ -205,6 +205,7 @@ tf_cuda_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", "@local_xla//xla:shape_util", + "@local_xla//xla/pjrt:host_memory_allocator", "@local_xla//xla/stream_executor/gpu:gpu_init_impl", "@local_xla//xla/stream_executor/integrations:stream_executor_allocator", "@local_xla//xla/tsl/framework:device_id_utils", diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index 8fd3dc450c98a6..80ba5156327af4 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -434,7 +434,7 @@ class GPUBFCAllocatorPrivateMethodsTest std::array bin_infos; { - absl::MutexLock l(&a.mutex_); + absl::MutexLock l(a.mutex_); bin_infos = 
a.get_bin_debug_info(); } @@ -486,7 +486,7 @@ class GPUBFCAllocatorPrivateMethodsTest initial_ptrs[i] = nullptr; } { - absl::MutexLock l(&a.mutex_); + absl::MutexLock l(a.mutex_); bin_infos = a.get_bin_debug_info(); } for (int i = 0; i < BFCAllocator::kNumBins; i++) { @@ -610,7 +610,7 @@ class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific } { - absl::MutexLock l(&a.mutex_); + absl::MutexLock l(a.mutex_); // Make sure there are more than 1 regions in preparation for the test. EXPECT_LT(1, a.region_manager_.regions().size()); } @@ -623,7 +623,7 @@ class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific // Deallocate free regions and there shall be only one region left. EXPECT_EQ(true, a.DeallocateFreeRegions(/*rounded_bytes=*/0)); { - absl::MutexLock l(&a.mutex_); + absl::MutexLock l(a.mutex_); EXPECT_EQ(1, a.region_manager_.regions().size()); } diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc index 0a2d98e840a849..1bebcc31d45c76 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc @@ -44,7 +44,8 @@ int64_t* before_mask = NewMask(0xabababababababab); int64_t* after_mask = NewMask(0xcdcdcdcdcdcdcdcd); bool CheckMask(se::StreamExecutor* exec, void* ptr, int64_t* mask) { - se::DeviceMemory gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}}; + stream_executor::DeviceAddress gpu_ptr{ + stream_executor::DeviceAddressBase{ptr, MASK_BYTES}}; int64_t tmp[MASK_WORDS]; absl::Status result = exec->SynchronousMemcpyD2H(gpu_ptr, MASK_BYTES, tmp); @@ -66,7 +67,8 @@ bool CheckMask(se::StreamExecutor* exec, void* ptr, int64_t* mask) { } void InitMask(se::StreamExecutor* exec, void* ptr, int64_t* mask) { - se::DeviceMemory gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}}; + stream_executor::DeviceAddress gpu_ptr{ + stream_executor::DeviceAddressBase{ptr, MASK_BYTES}}; absl::Status result = exec->SynchronousMemcpyH2D(mask, MASK_BYTES, &gpu_ptr); if (!result.ok()) { LOG(FATAL) << "Could not copy debug mask, " << result; @@ -175,8 +177,9 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { size_t req_size = base_allocator_->RequestedSize(allocated_ptr); std::vector nans((req_size + sizeof(float) - 1) / sizeof(float), std::nanf("")); - se::DeviceMemory nan_ptr{ - se::DeviceMemoryBase{static_cast(allocated_ptr), req_size}}; + stream_executor::DeviceAddress nan_ptr{ + stream_executor::DeviceAddressBase{static_cast(allocated_ptr), + req_size}}; absl::Status result = stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr); @@ -192,8 +195,8 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) { size_t req_size = base_allocator_->RequestedSize(ptr); std::vector nans((req_size + sizeof(float) - 1) / sizeof(float), std::nanf("")); - se::DeviceMemory nan_ptr{ - se::DeviceMemoryBase{static_cast(ptr), req_size}}; + stream_executor::DeviceAddress nan_ptr{ + stream_executor::DeviceAddressBase{static_cast(ptr), req_size}}; absl::Status result = stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr); if (!result.ok()) { diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc index 573e42fea61860..1d252f549d3803 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc @@ -57,7 +57,8 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_None) { 
memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64_t)); int64_t* gpu_array = TypedAllocator::Allocate(&a, cpu_array.size(), {}); - se::DeviceMemory gpu_array_ptr{se::DeviceMemoryBase{gpu_array}}; + stream_executor::DeviceAddress gpu_array_ptr{ + stream_executor::DeviceAddressBase{gpu_array}}; TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D( &cpu_array[0], s * sizeof(int64_t), &gpu_array_ptr)); EXPECT_TRUE(a.CheckHeader(gpu_array)); @@ -85,14 +86,14 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) { int64_t* gpu_array = TypedAllocator::Allocate(&a, cpu_array.size(), {}); - se::DeviceMemory gpu_array_ptr{ - se::DeviceMemoryBase{gpu_array}}; + stream_executor::DeviceAddress gpu_array_ptr{ + stream_executor::DeviceAddressBase{gpu_array}}; TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D( &cpu_array[0], cpu_array.size() * sizeof(int64_t), &gpu_array_ptr)); - se::DeviceMemory gpu_hdr_ptr{ - se::DeviceMemoryBase{gpu_array - 1}}; + stream_executor::DeviceAddress gpu_hdr_ptr{ + stream_executor::DeviceAddressBase{gpu_array - 1}}; // Clobber first word of the header. float pi = 3.1417; TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D(&pi, sizeof(float), @@ -122,15 +123,15 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) { int64_t* gpu_array = TypedAllocator::Allocate(&a, cpu_array.size(), {}); - se::DeviceMemory gpu_array_ptr{ - se::DeviceMemoryBase{gpu_array}}; + stream_executor::DeviceAddress gpu_array_ptr{ + stream_executor::DeviceAddressBase{gpu_array}}; TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D( &cpu_array[0], cpu_array.size() * sizeof(int64_t), &gpu_array_ptr)); // Clobber word of the footer. - se::DeviceMemory gpu_ftr_ptr{ - se::DeviceMemoryBase{gpu_array + s}}; + stream_executor::DeviceAddress gpu_ftr_ptr{ + stream_executor::DeviceAddressBase{gpu_array + s}}; float pi = 3.1417; TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D(&pi, sizeof(float), &gpu_ftr_ptr)); @@ -156,7 +157,8 @@ TEST(GPUDebugAllocatorTest, ResetToNan) { // Allocate 1024 floats float* gpu_array = TypedAllocator::Allocate(&a, cpu_array.size(), {}); - se::DeviceMemory gpu_array_ptr{se::DeviceMemoryBase{gpu_array}}; + stream_executor::DeviceAddress gpu_array_ptr{ + stream_executor::DeviceAddressBase{gpu_array}}; TF_CHECK_OK(stream_exec->SynchronousMemcpyD2H( gpu_array_ptr, cpu_array.size() * sizeof(float), &cpu_array[0])); for (float f : cpu_array) { @@ -200,7 +202,8 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) { // Allocate 1024 floats float* gpu_array = TypedAllocator::Allocate(&a, cpu_array.size(), {}); - se::DeviceMemory gpu_array_ptr{se::DeviceMemoryBase{gpu_array}}; + stream_executor::DeviceAddress gpu_array_ptr{ + stream_executor::DeviceAddressBase{gpu_array}}; TF_CHECK_OK(stream_exec->SynchronousMemcpyD2H( gpu_array_ptr, cpu_array.size() * sizeof(float), &cpu_array[0])); for (float f : cpu_array) { diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 22eecde5ba7d8a..f40fd04472700c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -15,6 +15,7 @@ limitations under the License. // TODO(opensource): Use a more generic sounding preprocessor name than // GOOGLE_CUDA +#include "xla/pjrt/host_memory_allocator.h" #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) @@ -1880,8 +1881,10 @@ Status BaseGPUDeviceFactory::CreateDevices( // TODO(chuanhao): Use the correct NUMA_NODE. 
const int64_t numa_node = 0; - std::unique_ptr pjrt_gpu_host_allocator( - process_state->GetGpuHostAllocator(/*options=*/{}, numa_node)); + auto pjrt_gpu_host_allocator = + std::make_unique( + std::unique_ptr( + process_state->GetGpuHostAllocator(/*options=*/{}, numa_node))); if (populate_pjrt_gpu_client_creation_info && !should_create_new_pjrt_client) { diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index d09cdc2fb2c0f4..441715bd2d22cb 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -105,28 +105,28 @@ class BaseGPUDevice : public LocalDevice { #endif se::Stream* host_to_device = nullptr; se::Stream* device_to_host = nullptr; - gtl::InlinedVector device_to_device; + absl::InlinedVector device_to_device; int priority = 0; }; // Initialize the device and return the status of initialization. #ifdef TF_GPU_USE_PJRT - Status Init(const SessionOptions& options, - xla::LocalDeviceState* xla_local_device_state); + absl::Status Init(const SessionOptions& options, + xla::LocalDeviceState* xla_local_device_state); #else - Status Init(const SessionOptions& options); + absl::Status Init(const SessionOptions& options); #endif // TF_GPU_USE_PJRT void Compute(OpKernel* op_kernel, OpKernelContext* context) override; - Status Sync() override; + absl::Status Sync() override; void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) override; - Status MakeTensorFromProto(const TensorProto& tensor_proto, - AllocatorAttributes alloc_attrs, - Tensor* tensor) override; + absl::Status MakeTensorFromProto(const TensorProto& tensor_proto, + AllocatorAttributes alloc_attrs, + Tensor* tensor) override; void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor, const DeviceContext* device_context, @@ -135,9 +135,9 @@ class BaseGPUDevice : public LocalDevice { // The caller owns the returned device. PerOpGpuDevice* MakeGpuDevice() override; - Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, - DeviceContext* dc, - Allocator* allocator) override; + absl::Status ReinitializeGpuDevice(OpKernelContext* context, + PerOpGpuDevice* device, DeviceContext* dc, + Allocator* allocator) override; // Returns the platform GPU id of this device within the native driver system; // e.g., for CUDA and ROCm this is the ordinal of the GPU within the system. @@ -164,7 +164,7 @@ class BaseGPUDevice : public LocalDevice { // If returned value is > 0 then GPU Memory chunks freed before this count // are guaranteed not to be in use by any kernel pending on this device. - uint64 SafeAllocFrontier(uint64 old_value) override; + uint64_t SafeAllocFrontier(uint64_t old_value) override; // Returns the number of kernels that have been queued for execution on // the compute stream and are not yet known to have completed. @@ -216,13 +216,13 @@ class BaseGPUDevice : public LocalDevice { EventMgr* em_ = nullptr; std::unique_ptr thread_pool_; std::unique_ptr kernel_tracker_; - int32 pending_cap_ = 0; + int32_t pending_cap_ = 0; bool timestamped_allocator_ = false; NodeFileWriter* node_file_writer_ = nullptr; // not owned const GPUOptions::Experimental::StreamMergeOptions stream_merge_options_; // Initialize scratch buffers used by Eigen. 
- Status InitScratchBuffers(); + absl::Status InitScratchBuffers(); void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device, int stream_id, Allocator* allocator); @@ -235,9 +235,9 @@ class BaseGPUDevice : public LocalDevice { // allocate memory or if the tensor "from" is not DMA-copyable. // If there is no error prior to enqueueing the copy, an OK status // is returned. - Status MaybeCopyTensorToGPU(const AllocatorAttributes& alloc_attrs, - const Tensor& from, Tensor* to, - StatusCallback done); + absl::Status MaybeCopyTensorToGPU(const AllocatorAttributes& alloc_attrs, + const Tensor& from, Tensor* to, + StatusCallback done); Tensor CopyGpuTensorToHostDebugOnly(const Tensor& gpu_tensor); void LogInputs(OpKernel* op_kernel, OpKernelContext* context); @@ -293,25 +293,25 @@ class GPUKernelTracker { // Determine whether a GPU kernel should have a recording event queued // immediately afterwards. If so, advance the counter and return the new // counter value after enqueuing. - uint64 MaybeQueue(OpKernelContext* ctx); + uint64_t MaybeQueue(OpKernelContext* ctx); // Record that a GPU kernel has just been enqueued on the compute stream. // Inserts the supplied counter value in a new PendingKernel record appended // to the end of the ring buffer then returns that same count. // Caller is responsible for ensuring that RecordTerminate() is eventually // called with the same counter value. - void RecordQueued(uint64 queued_count, int weight) + void RecordQueued(uint64_t queued_count, int weight) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); // Takes a count value returned by RecordQueued and finds the corresponding // PendingKernel record in the ring buffer. Marks the kernel as completed and // advances the completion frontier accordingly. - void RecordTerminated(uint64 queued_count); + void RecordTerminated(uint64_t queued_count); // Returns the largest timing count such that all kernels queued no // later than that count are known to have terminated. - inline uint64 LastTerminatedCount(uint64 old_value) { - uint64 new_value = last_terminated_count_.load(std::memory_order_relaxed); + inline uint64_t LastTerminatedCount(uint64_t old_value) { + uint64_t new_value = last_terminated_count_.load(std::memory_order_relaxed); if (new_value == old_value) { MaybeQueueProgressEvent(); } @@ -344,22 +344,22 @@ class GPUKernelTracker { std::unique_ptr owned_counter_; Allocator* allocator_ = nullptr; EventMgr* em_ = nullptr; - std::atomic last_terminated_count_ = {1}; + std::atomic last_terminated_count_ = {1}; void MaybeQueueProgressEvent(); // Records when a kernel was queued for execution. Kernel launches are // identified by a unique count value from a per-GPU device timing counter. struct PendingKernel { - uint64 queued_count; + uint64_t queued_count; int weight; bool terminated; PendingKernel(const PendingKernel& pk) = default; PendingKernel() : queued_count(0), weight(0), terminated(false) {} }; mutex mu_; - int32 mem_since_last_ TF_GUARDED_BY(mu_); - int32 ops_since_last_ TF_GUARDED_BY(mu_); + int32_t mem_since_last_ TF_GUARDED_BY(mu_); + int32_t ops_since_last_ TF_GUARDED_BY(mu_); // Ring buffer of PendingKernel records. std::vector pending_kernels_ TF_GUARDED_BY(mu_); // Next unused slot in pending_kernels_. 
@@ -376,12 +376,13 @@ class GPUKernelTracker { class BaseGPUDeviceFactory : public DeviceFactory { public: - Status ListPhysicalDevices(std::vector* devices) override; - Status CreateDevices(const SessionOptions& options, - const std::string& name_prefix, - std::vector>* devices) override; - Status GetDeviceDetails(int device_index, - std::unordered_map* details) override; + absl::Status ListPhysicalDevices(std::vector* devices) override; + absl::Status CreateDevices( + const SessionOptions& options, const std::string& name_prefix, + std::vector>* devices) override; + absl::Status GetDeviceDetails( + int device_index, + std::unordered_map* details) override; struct InterconnectMap { // Name of interconnect technology, if known. @@ -390,7 +391,7 @@ class BaseGPUDeviceFactory : public DeviceFactory { // Where architecture-specific subclassing is not done that won't // always be possible. The minimum expectation is that // faster links should have a higher value than slower links. - int32 strength; + int32_t strength; static const int kSameDeviceStrength; static const int kStreamExecutorStrength; std::set> @@ -400,7 +401,7 @@ class BaseGPUDeviceFactory : public DeviceFactory { protected: // Populates *maps with interconnect maps for all local direct access // pathways between GPUs. - virtual Status GetInterconnectMaps( + virtual absl::Status GetInterconnectMaps( const std::vector& visible_gpu_order, se::Platform* gpu_manager, std::vector* maps); @@ -413,7 +414,7 @@ class BaseGPUDeviceFactory : public DeviceFactory { LocalityMap; // Populates *localities with the DeviceLocality descriptor for // every TfDeviceId. - virtual Status GetDeviceLocalities( + virtual absl::Status GetDeviceLocalities( int num_tf_gpus, const std::vector& interconnects, LocalityMap* localities); @@ -422,29 +423,29 @@ class BaseGPUDeviceFactory : public DeviceFactory { // 'devices' vector. The 'gpu_allocator' is created by the caller and usually // preallocates a set amount of GPU memory. 
#ifdef TF_GPU_USE_PJRT - Status CreateGPUDevice(const SessionOptions& options, - const std::string& name_prefix, - tsl::TfDeviceId tf_device_id, - const DeviceLocality& dev_locality, - xla::LocalDeviceState* xla_local_device_state, - Allocator* gpu_allocator, - std::vector>* devices); + absl::Status CreateGPUDevice(const SessionOptions& options, + const std::string& name_prefix, + tsl::TfDeviceId tf_device_id, + const DeviceLocality& dev_locality, + xla::LocalDeviceState* xla_local_device_state, + Allocator* gpu_allocator, + std::vector>* devices); #else - Status CreateGPUDevice(const SessionOptions& options, - const std::string& name_prefix, - tsl::TfDeviceId tf_device_id, - const DeviceLocality& dev_locality, - Allocator* gpu_allocator, - std::vector>* devices); + absl::Status CreateGPUDevice(const SessionOptions& options, + const std::string& name_prefix, + tsl::TfDeviceId tf_device_id, + const DeviceLocality& dev_locality, + Allocator* gpu_allocator, + std::vector>* devices); #endif // TF_GPU_USE_PJRT virtual std::unique_ptr CreateGPUDevice( - const SessionOptions& options, const string& name, Bytes memory_limit, - const DeviceLocality& dev_locality, tsl::TfDeviceId tf_device_id, - const string& physical_device_desc, Allocator* gpu_allocator, - Allocator* cpu_allocator) = 0; + const SessionOptions& options, const std::string& name, + Bytes memory_limit, const DeviceLocality& dev_locality, + tsl::TfDeviceId tf_device_id, const std::string& physical_device_desc, + Allocator* gpu_allocator, Allocator* cpu_allocator) = 0; - Status EnablePeerAccess( + absl::Status EnablePeerAccess( const std::vector& visible_gpu_order); // Returns into 'ids' the list of valid platform GPU ids, in the order that @@ -452,7 +453,7 @@ class BaseGPUDeviceFactory : public DeviceFactory { // based upon 'visible_gpu_order' which was generated by parsing // GPUOptions::visible_device_list which is a comma-separated list of CUDA or // ROCm GPU ids. - Status GetValidDeviceIds( + absl::Status GetValidDeviceIds( const std::vector& visible_gpu_order, std::vector* ids); @@ -460,7 +461,7 @@ class BaseGPUDeviceFactory : public DeviceFactory { // field cached_device_ids_. Passes {0, 1, ..., num_devices-1} to // GetValidDeviceIds, so this should only be used in functions where all // devices should be treated as visible, like ListPhysicalDevices. - Status CacheDeviceIds(); + absl::Status CacheDeviceIds(); // visible_gpu_initialized_[platform_device_id] is true if visible GPU // platform_device_id has been initialized by the process. 
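// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the gpu_device.h changes above
// replace the unqualified Status and gtl::InlinedVector spellings with
// absl::Status and absl::InlinedVector. A minimal, self-contained example of
// the same pattern, with hypothetical names, assuming only the Abseil status
// and container headers:
#include <cstdint>

#include "absl/container/inlined_vector.h"
#include "absl/status/status.h"

namespace gpu_device_sketch {

// Before: gtl::InlinedVector<..., 4> device_to_device;
// After:  absl::InlinedVector with the same inline capacity.
using DeviceToDeviceIds = absl::InlinedVector<int, 4>;  // placeholder element type

// Before: Status Init(const SessionOptions& options);
// After:  absl::Status, returning absl::OkStatus() on success.
inline absl::Status InitSketch(int32_t pending_cap) {
  if (pending_cap < 0) {
    return absl::InvalidArgumentError("pending_cap must be non-negative");
  }
  return absl::OkStatus();
}

}  // namespace gpu_device_sketch
// -----------------------------------------------------------------------------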
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc index d5b9c127351a36..2848cf5d16d91d 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc @@ -28,10 +28,11 @@ namespace tensorflow { class GPUDevice : public BaseGPUDevice { public: - GPUDevice(const SessionOptions& options, const string& name, + GPUDevice(const SessionOptions& options, const std::string& name, Bytes memory_limit, const DeviceLocality& locality, - tsl::TfDeviceId tf_device_id, const string& physical_device_desc, - Allocator* gpu_allocator, Allocator* cpu_allocator) + tsl::TfDeviceId tf_device_id, + const std::string& physical_device_desc, Allocator* gpu_allocator, + Allocator* cpu_allocator) : BaseGPUDevice(options, name, memory_limit, locality, tf_device_id, physical_device_desc, gpu_allocator, cpu_allocator, false /* sync every op */), @@ -64,10 +65,10 @@ class GPUDevice : public BaseGPUDevice { class GPUDeviceFactory : public BaseGPUDeviceFactory { private: std::unique_ptr CreateGPUDevice( - const SessionOptions& options, const string& name, Bytes memory_limit, - const DeviceLocality& locality, tsl::TfDeviceId tf_device_id, - const string& physical_device_desc, Allocator* gpu_allocator, - Allocator* cpu_allocator) override { + const SessionOptions& options, const std::string& name, + Bytes memory_limit, const DeviceLocality& locality, + tsl::TfDeviceId tf_device_id, const std::string& physical_device_desc, + Allocator* gpu_allocator, Allocator* cpu_allocator) override { return absl::make_unique(options, name, memory_limit, locality, tf_device_id, physical_device_desc, gpu_allocator, cpu_allocator); @@ -82,7 +83,7 @@ REGISTER_LOCAL_DEVICE_FACTORY("GPU", GPUDeviceFactory, 210); // ----------------------------------------------------------------------------- class GPUCompatibleCPUDevice : public ThreadPoolDevice { public: - GPUCompatibleCPUDevice(const SessionOptions& options, const string& name, + GPUCompatibleCPUDevice(const SessionOptions& options, const std::string& name, Bytes memory_limit, const DeviceLocality& locality, Allocator* allocator) : ThreadPoolDevice(options, name, memory_limit, locality, allocator), @@ -114,14 +115,15 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice { // The associated factory. class GPUCompatibleCPUDeviceFactory : public DeviceFactory { public: - Status ListPhysicalDevices(std::vector* devices) override { + absl::Status ListPhysicalDevices(std::vector* devices) override { devices->push_back("/physical_device:CPU:0"); - return OkStatus(); + return absl::OkStatus(); } - Status CreateDevices(const SessionOptions& options, const string& name_prefix, - std::vector>* devices) override { + absl::Status CreateDevices( + const SessionOptions& options, const std::string& name_prefix, + std::vector>* devices) override { int n = 1; auto iter = options.config.device_count().find("CPU"); if (iter != options.config.device_count().end()) { @@ -131,7 +133,7 @@ class GPUCompatibleCPUDeviceFactory : public DeviceFactory { ? 
port::NUMANumNodes() : 1; for (int i = 0; i < n; i++) { - string name = strings::StrCat(name_prefix, "/device:CPU:", i); + std::string name = absl::StrCat(name_prefix, "/device:CPU:", i); int numa_node = i % num_numa_nodes; DeviceLocality locality; locality.set_numa_node(numa_node); @@ -140,7 +142,7 @@ class GPUCompatibleCPUDeviceFactory : public DeviceFactory { ProcessState::singleton()->GetCPUAllocator(numa_node))); } - return OkStatus(); + return absl::OkStatus(); } }; REGISTER_LOCAL_DEVICE_FACTORY("CPU", GPUCompatibleCPUDeviceFactory, 70); diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc index ad854582ff8116..f3c7681fa26b30 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc @@ -80,7 +80,7 @@ bool IsRocm() { .IsRocm(); } -void ExpectErrorMessageSubstr(const Status& s, StringPiece substr) { +void ExpectErrorMessageSubstr(const absl::Status& s, absl::string_view substr) { EXPECT_TRUE(absl::StrContains(s.ToString(), substr)) << s << ", expected substring " << substr; } @@ -96,12 +96,12 @@ class GPUDeviceTest : public ::testing::Test { protected: static SessionOptions MakeSessionOptions( - const string& visible_device_list = "", + const std::string& visible_device_list = "", double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1, const std::vector>& memory_limit_mb = {}, - const std::vector>& priority = {}, - const std::vector>& device_ordinal = {}, - const int32 num_virtual_devices = 0, + const std::vector>& priority = {}, + const std::vector>& device_ordinal = {}, + const int32_t num_virtual_devices = 0, const bool use_cuda_malloc_async = false) { SessionOptions options; ConfigProto* config = &options.config; @@ -182,7 +182,7 @@ TEST_F(GPUDeviceTest, CudaMallocAsync) { SessionOptions opts = MakeSessionOptions("0", 0, 1, {}, {}, {}, 0, /*use_cuda_malloc_async=*/true); std::vector> devices; - Status status; + absl::Status status; int number_instantiated = se::GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly(); { // The new scope is to trigger the destruction of the object. 
@@ -213,7 +213,7 @@ TEST_F(GPUDeviceTest, CudaMallocAsyncPreallocate) { /*use_cuda_malloc_async=*/true); setenv("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC", "2048", 1); std::vector> devices; - Status status; + absl::Status status; int number_instantiated = se::GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly(); @@ -244,7 +244,7 @@ TEST_F(GPUDeviceTest, CudaMallocAsyncPreallocate) { TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) { SessionOptions opts = MakeSessionOptions("0,abc"); std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); ExpectErrorMessageSubstr(status, "Could not parse entry"); @@ -253,7 +253,7 @@ TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) { TEST_F(GPUDeviceTest, InvalidGpuId) { SessionOptions opts = MakeSessionOptions("100"); std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); ExpectErrorMessageSubstr(status, @@ -263,7 +263,7 @@ TEST_F(GPUDeviceTest, InvalidGpuId) { TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) { SessionOptions opts = MakeSessionOptions("0,0"); std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); ExpectErrorMessageSubstr(status, @@ -273,7 +273,7 @@ TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) { TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) { SessionOptions opts = MakeSessionOptions("0", 0.1, 1, {{}}); std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); ExpectErrorMessageSubstr( @@ -285,7 +285,7 @@ TEST_F(GPUDeviceTest, GpuDeviceCountTooSmall) { // (empty) VirtualDevices messages. SessionOptions opts = MakeSessionOptions("0", 0, 0, {{}}); std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::UNKNOWN); ExpectErrorMessageSubstr(status, @@ -297,7 +297,7 @@ TEST_F(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) { // messages. SessionOptions opts = MakeSessionOptions("0", 0, 8, {{}, {}}); std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::UNKNOWN); ExpectErrorMessageSubstr(status, @@ -311,7 +311,7 @@ TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) { // messages. 
SessionOptions opts = MakeSessionOptions("0,1", 0, 8, {{}}); std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); ExpectErrorMessageSubstr( @@ -380,7 +380,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) { MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-9999, 0}}); #endif std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); #if TENSORFLOW_USE_ROCM @@ -403,7 +403,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) { SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}}); #endif std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); #if TENSORFLOW_USE_ROCM @@ -461,7 +461,7 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) { // 0 is a valid priority value for both AMD and NVidia GPUs SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0}}); std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); ExpectErrorMessageSubstr( @@ -550,7 +550,7 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) { ->mutable_experimental() ->set_use_unified_memory(true); std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices); EXPECT_EQ(status.code(), error::INTERNAL); ExpectErrorMessageSubstr(status, "does not support oversubscription."); @@ -615,7 +615,7 @@ TEST_F(GPUDeviceTest, CopyTensorInSameDevice) { CopyCPUToGPU(&cpu_tensor, &input_tensor, device, device_context); absl::Notification note; device->CopyTensorInSameDevice(&input_tensor, &output_tensor, device_context, - [¬e](const Status& s) { + [¬e](const absl::Status& s) { TF_ASSERT_OK(s); note.Notify(); }); @@ -633,11 +633,11 @@ TEST_F(GPUDeviceTest, CopyTensorInSameDevice) { TEST_F(GPUDeviceTest, DeviceDetails) { DeviceFactory* factory = DeviceFactory::GetFactory("GPU"); - std::vector devices; + std::vector devices; TF_ASSERT_OK(factory->ListPhysicalDevices(&devices)); EXPECT_GE(devices.size(), 1); for (int i = 0; i < devices.size(); i++) { - std::unordered_map details; + std::unordered_map details; TF_ASSERT_OK(factory->GetDeviceDetails(i, &details)); #if TENSORFLOW_USE_ROCM EXPECT_EQ(details.count("compute_capability"), 0); @@ -673,7 +673,7 @@ class GPUKernelTrackerTest : public ::testing::Test { nullptr)); } - void RecordQueued(uint64 v) { + void RecordQueued(uint64_t v) { mutex_lock l(kernel_tracker_->mu_); kernel_tracker_->RecordQueued(v, 1); } @@ -690,7 +690,7 @@ TEST_F(GPUKernelTrackerTest, CappingOnly) { std::deque queued_counts; for (int i = 0; i < 32; ++i) { - uint64 queued_count = timing_counter_->next(); + uint64_t queued_count = timing_counter_->next(); queued_counts.push_back(queued_count); RecordQueued(queued_count); } @@ -712,7 +712,7 @@ 
TEST_F(GPUKernelTrackerTest, CappingOnly) { // to introduce gaps between last_completed_ and first_available_. int64_t lower_bound = timing_counter_->get(); for (int i = 0; i < 1111; ++i) { - uint64 queued_count = timing_counter_->next(); + uint64_t queued_count = timing_counter_->next(); queued_counts.push_back(queued_count); RecordQueued(queued_count); int64_t upper_bound = timing_counter_->get(); diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc index 3179d8858ad154..15fd92a873bea0 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc @@ -122,11 +122,11 @@ static std::unique_ptr CreateSubAllocator( options.experimental().use_unified_memory()); if (use_unified_memory) { auto unified_memory_allocator = - executor->CreateMemoryAllocator(stream_executor::MemoryType::kUnified) + executor->CreateMemoryAllocator(stream_executor::MemorySpace::kUnified) .value(); return std::make_unique( std::move(unified_memory_allocator), - stream_executor::MemoryType::kUnified, platform_device_id.value(), + stream_executor::MemorySpace::kUnified, platform_device_id.value(), alloc_visitors); } else { return std::make_unique( @@ -140,7 +140,7 @@ Allocator* GPUProcessState::GetGPUAllocator( CHECK(process_state_); #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) - const string& allocator_type = options.allocator_type(); + const std::string& allocator_type = options.allocator_type(); mutex_lock lock(mu_); tsl::CheckValidTfDeviceId( DEVICE_GPU, se::GPUMachineManager()->VisibleDeviceCount(), tf_device_id); @@ -172,7 +172,7 @@ Allocator* GPUProcessState::GetGPUAllocator( auto gpu_bfc_allocator = std::make_unique( std::move(sub_allocator), total_bytes, - strings::StrCat("GPU_", tf_device_id.value(), "_bfc"), [&] { + absl::StrCat("GPU_", tf_device_id.value(), "_bfc"), [&] { GPUBFCAllocator::Options o; o.allow_growth = options.allow_growth(); o.allow_retry_on_failure = @@ -366,9 +366,9 @@ Allocator* GPUProcessState::GetGpuHostAllocator(const GPUOptions& options, gpu_host_free_visitors_.push_back({}); } auto host_memory_allocator = - se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value(); + se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value(); SubAllocator* sub_allocator = new se::StreamExecutorAllocator( - std::move(host_memory_allocator), stream_executor::MemoryType::kHost, + std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, numa_node, gpu_host_alloc_visitors_[numa_node], gpu_host_free_visitors_[numa_node]); diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc index 4d192d8af9fab4..6fb3a800d0ab60 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_util.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc @@ -548,34 +548,35 @@ absl::Status GPUUtil::SyncAll(Device* gpu_device) { return absl::OkStatus(); } -string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) { - string ret; +std::string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) { + std::string ret; CHECK(tensor); const int64_t num_bytes = std::min( FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes()); void* ptr = (num_bytes > 0) ? 
GetBase(tensor) : nullptr; - strings::Appendf(&ret, "%p:", ptr); + void* arg1 = ptr; + absl::StrAppendFormat(&ret, "%p:", arg1); if (num_bytes > 0) { auto* dev_info = device->tensorflow_accelerator_device_info(); if (!dev_info) { - strings::StrAppend( + absl::StrAppend( &ret, PrintMemory(reinterpret_cast(ptr), num_bytes)); } else { - string buf; + std::string buf; buf.resize(num_bytes); DeviceMemoryBase gpu_ptr(ptr, num_bytes); auto s = dev_info->stream->parent()->SynchronousMemcpyD2H( gpu_ptr, num_bytes, &*buf.begin()); - strings::StrAppend(&ret, PrintMemory(&*buf.begin(), num_bytes)); + absl::StrAppend(&ret, PrintMemory(&*buf.begin(), num_bytes)); } } return ret; } // TODO(pbar) Checksum is called from places without a valid device context. -uint64 GPUUtil::Checksum(Device* gpu_device, - const DeviceContext* device_context, - const Tensor& tensor) { +uint64_t GPUUtil::Checksum(Device* gpu_device, + const DeviceContext* device_context, + const Tensor& tensor) { Tensor copy(tensor.dtype(), tensor.shape()); absl::Status s; absl::Notification n; @@ -589,7 +590,7 @@ uint64 GPUUtil::Checksum(Device* gpu_device, return Checksum(copy); } -uint64 GPUUtil::Checksum(const Tensor& tensor) { +uint64_t GPUUtil::Checksum(const Tensor& tensor) { const float* fptr = reinterpret_cast(GetBase(&tensor)); size_t num_bytes = tensor.TotalBytes(); size_t num_floats = num_bytes / sizeof(float); diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h index 0b650ad9804343..6675aa3802c081 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_util.h +++ b/tensorflow/core/common_runtime/gpu/gpu_util.h @@ -62,7 +62,7 @@ class GPUUtil { // For debugging purpose, given a "device" and a "tensor" allocated // on the device, return a string printing each byte in the tensor // (up to a limit). "device" can be either a CPU or a GPU device. - static string MemoryDebugString(const Device* device, Tensor* tensor); + static std::string MemoryDebugString(const Device* device, Tensor* tensor); // Map a Tensor as a DeviceMemory object wrapping the given typed // buffer. @@ -72,18 +72,19 @@ class GPUUtil { template static se::DeviceMemory AsDeviceMemory(const Tensor& t) { T* ptr = reinterpret_cast(const_cast(DMAHelper::base(&t))); - return se::DeviceMemory(se::DeviceMemoryBase(ptr, t.TotalBytes())); + return se::DeviceMemory( + stream_executor::DeviceAddressBase(ptr, t.TotalBytes())); } // Computes a checksum over the contents of "tensor", which is allocated // on "gpu_device". - static uint64 Checksum(Device* gpu_device, - const DeviceContext* device_context, - const Tensor& tensor); + static uint64_t Checksum(Device* gpu_device, + const DeviceContext* device_context, + const Tensor& tensor); // Computes a checksum over the contents of "tensor", which is allocated // in local CPU RAM. 
- static uint64 Checksum(const Tensor& tensor); + static uint64_t Checksum(const Tensor& tensor); static void CopyCPUTensorToGPU(const Tensor* cpu_tensor, const DeviceContext* device_context, diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc index fbc733ce4b85d4..954658e1111a4c 100644 --- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc @@ -30,9 +30,9 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) { se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value(); se::StreamExecutor* se = platform->ExecutorForDevice(/*ordinal=*/0).value(); auto host_memory_allocator = - se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value(); + se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value(); SubAllocator* sub_allocator = new se::StreamExecutorAllocator( - std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0); + std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0); PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/, sub_allocator, new NoopRounder, "pool"); @@ -49,9 +49,9 @@ TEST(PoolAllocatorTest, ZeroSizePool) { se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value(); se::StreamExecutor* se = platform->ExecutorForDevice(/*ordinal=*/0).value(); auto host_memory_allocator = - se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value(); + se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value(); SubAllocator* sub_allocator = new se::StreamExecutorAllocator( - std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0); + std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0); PoolAllocator pool(0 /*pool_size_limit*/, false /*auto_resize*/, sub_allocator, new NoopRounder, "pool"); @@ -83,9 +83,9 @@ TEST(PoolAllocatorTest, Alignment) { se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value(); se::StreamExecutor* se = platform->ExecutorForDevice(/*ordinal=*/0).value(); auto host_memory_allocator = - se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value(); + se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value(); SubAllocator* sub_allocator = new se::StreamExecutorAllocator( - std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0); + std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0); PoolAllocator pool(0 /*pool_size_limit*/, false /*auto_resize*/, sub_allocator, new NoopRounder, "pool"); for (int i = 0; i < 16; ++i) { @@ -145,9 +145,9 @@ TEST(PoolAllocatorTest, CudaHostAllocator) { se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value(); se::StreamExecutor* se = platform->ExecutorForDevice(/*ordinal=*/0).value(); auto host_memory_allocator = - se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value(); + se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value(); SubAllocator* sub_allocator = new se::StreamExecutorAllocator( - std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0, + std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0, {alloc_visitor}, {free_visitor}); PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/, sub_allocator, new NoopRounder, "pool"); @@ -250,9 +250,9 @@ TEST(PoolAllocatorTest, Name) { se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value(); se::StreamExecutor* se = 
platform->ExecutorForDevice(/*ordinal=*/0).value(); auto host_memory_allocator = - se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value(); + se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value(); SubAllocator* sub_allocator = new se::StreamExecutorAllocator( - std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0); + std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0); PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/, sub_allocator, new NoopRounder, "pool"); EXPECT_EQ("pool", pool.Name()); diff --git a/tensorflow/core/common_runtime/gradients.cc b/tensorflow/core/common_runtime/gradients.cc index 466977ecf772d6..ede2bed5eced15 100644 --- a/tensorflow/core/common_runtime/gradients.cc +++ b/tensorflow/core/common_runtime/gradients.cc @@ -40,18 +40,18 @@ namespace tensorflow { static const char* const kGradientOp = "SymbolicGradient"; static const char* const kNodeLabel = "Func"; -string NodeOut::name() const { +std::string NodeOut::name() const { if (index == 0) { return node->name(); } else { - return strings::StrCat(node->name(), ":", index); + return absl::StrCat(node->name(), ":", index); } } DataType NodeOut::dtype() const { return node->output_type(index); } struct NodeOutHash { - uint64 operator()(const NodeOut& x) const { + uint64_t operator()(const NodeOut& x) const { return Hash64(reinterpret_cast(&x.node), sizeof(Node*), x.index); } @@ -334,7 +334,7 @@ NodeOut SymbolicGradientBuilder::SumGradients(const NodeOut& src) { return {add, 0}; } -static bool IsPrimitiveOpWithNoGrad(const string& func) { +static bool IsPrimitiveOpWithNoGrad(const std::string& func) { gradient::Creator creator; absl::Status s = gradient::GetOpGradientCreator(func, &creator); return s.ok() && (creator == nullptr); diff --git a/tensorflow/core/common_runtime/gradients.h b/tensorflow/core/common_runtime/gradients.h index aaa9cad80ad691..6eb32e450e1dcf 100644 --- a/tensorflow/core/common_runtime/gradients.h +++ b/tensorflow/core/common_runtime/gradients.h @@ -28,7 +28,7 @@ struct NodeOut { int index; // Returns the string name that represents the output of this node. - string name() const; + std::string name() const; // Returns the data type of the output of this node. DataType dtype() const; }; diff --git a/tensorflow/core/common_runtime/graph_constructor.cc b/tensorflow/core/common_runtime/graph_constructor.cc index cbbbee60ee7c6f..5fb43daa1c0b8d 100644 --- a/tensorflow/core/common_runtime/graph_constructor.cc +++ b/tensorflow/core/common_runtime/graph_constructor.cc @@ -137,14 +137,14 @@ class GraphConstructor { bool expect_device_spec; bool propagate_device_spec; - string prefix; + std::string prefix; bool uniquify_names; bool uniquify_prefix; std::map input_map; bool skip_mapped_nodes; - std::vector control_dependencies; + std::vector control_dependencies; std::vector return_tensors; - std::vector return_nodes; + std::vector return_nodes; // TODO(ashankar): This bool exists to separate out functionality required // to make ImportGraphDef a close equivalent of Python's import_graph_def @@ -166,7 +166,7 @@ class GraphConstructor { // value to the Node when they are missing from the NodeDef. bool add_default_attributes = true; - string default_device; + std::string default_device; }; typedef absl::Span NodeDefSlice; @@ -288,7 +288,7 @@ class GraphConstructor { // Returns a unique version of `original_name`, or `original_name` if it's // already unique in the graph. 
- string FindUniqueName(absl::string_view original_name); + std::string FindUniqueName(absl::string_view original_name); // Decrement pending count for users of `processed` and add the ones that now // have all of their pending inputs satisfied to `ready_`. @@ -321,7 +321,7 @@ class GraphConstructor { const VersionDef original_versions_; // A copy of opts_.prefix, possibly uniquified. - string prefix_; + std::string prefix_; StackTracesMap traces_; @@ -364,7 +364,7 @@ class GraphConstructor { // Imported node names that have been uniquified. The key is the original // name, the value is the new unique name. - gtl::FlatMap uniquified_names_; + gtl::FlatMap uniquified_names_; // Index of NodeDefs in node_defs_ with all inputs already converted. We use a // (sorted) set so nodes are created in the order defined in the GraphDef. @@ -381,10 +381,10 @@ class GraphConstructor { // Used in the conversion from node_defs_ to g_ to represent the ith input // of a node. struct InputInfo { - explicit InputInfo(const string& node_name, Node* n, int i) + explicit InputInfo(const std::string& node_name, Node* n, int i) : name(node_name), node(n), index(i) {} // Use string instead of StringPiece so we don't have to manage lifetime - string name; + std::string name; Node* node; int index; @@ -402,10 +402,10 @@ class GraphConstructor { // Used in the conversion from node_defs_ to g_ to represent an edge from // the node named 'name' to node 'n'. struct EdgeInfo { - explicit EdgeInfo(const string& name, int i1, Node* n, int i2) + explicit EdgeInfo(const std::string& name, int i1, Node* n, int i2) : src_name(name), src_index(i1), dst_node(n), dst_index(i2) {} // Use string instead of StringPiece so we don't have to manage lifetime - string src_name; + std::string src_name; int src_index; Node* dst_node; int dst_index; @@ -594,7 +594,7 @@ bool NodeNameInValues(const std::map& input_map, return false; } -bool NodeNameInValues(const std::vector& control_dependencies, +bool NodeNameInValues(const std::vector& control_dependencies, const absl::string_view& node_name) { return std::find(control_dependencies.begin(), control_dependencies.end(), node_name) != control_dependencies.end(); @@ -632,7 +632,7 @@ absl::Status GraphConstructor::EnsureNoNameCollisions() { } if (prefix_.empty() && opts_.importing && !opts_.uniquify_names) { for (size_t i = 0; i < node_def_count(); ++i) { - const string& name = get_node_def(i).name(); + const std::string& name = get_node_def(i).name(); if (NameExistsInGraph(name)) { return errors::InvalidArgument("Node name '", name, "' already exists in the Graph"); @@ -646,7 +646,7 @@ absl::Status GraphConstructor::EnsureNoNameCollisions() { "' would lead to invalid node names"); } if (NameExistsInGraph(prefix_no_slash) && opts_.uniquify_prefix) { - prefix_ = strings::StrCat(FindUniqueName(prefix_no_slash), "/"); + prefix_ = absl::StrCat(FindUniqueName(prefix_no_slash), "/"); } } return absl::OkStatus(); @@ -668,7 +668,7 @@ absl::Status GraphConstructor::ValidateInputMapAndControlDependencies() { "control edge and non-control edge"); } } - for (const string& node : opts_.control_dependencies) { + for (const std::string& node : opts_.control_dependencies) { if (existing_nodes_.count(node) == 0) { return errors::InvalidArgument( "node '", node, @@ -727,7 +727,7 @@ absl::Status GraphConstructor::InitFromEdges() { const int num_nodes = node_def_count(); pending_count_.reserve(num_nodes); outputs_.resize(num_nodes); - gtl::FlatSet next_iteration_nodes; + gtl::FlatSet next_iteration_nodes; for (int 
n = 0; n < node_def_count(); ++n) { const NodeDef& node_def = get_node_def(n); if (IsNextIteration(node_def)) { @@ -752,7 +752,7 @@ absl::Status GraphConstructor::InitFromEdges() { num_control_edges++; } else { TensorId id(ParseTensorName(input_name)); - if (next_iteration_nodes.find(string(id.first)) != + if (next_iteration_nodes.find(std::string(id.first)) != next_iteration_nodes.end()) { has_loop_back_edge = true; } @@ -796,7 +796,7 @@ absl::Status GraphConstructor::ValidateColocationConstraints( return absl::OkStatus(); const auto iter = node_def.attr().find(kColocationAttrName); if (iter == node_def.attr().end()) return absl::OkStatus(); - for (const string& c : iter->second.list().s()) { + for (const std::string& c : iter->second.list().s()) { absl::string_view s(c); if (absl::ConsumePrefix(&s, kColocationGroupPrefix) && gdef_nodes_.find(s) == gdef_nodes_.end()) { @@ -957,11 +957,11 @@ void GraphConstructor::AddControlDependencies( // node_def either has no inputs or all remapped inputs, add the control // dependencies - for (const string& control_dep : opts_.control_dependencies) { - string input = TensorId(control_dep, Graph::kControlSlot).ToString(); + for (const std::string& control_dep : opts_.control_dependencies) { + std::string input = TensorId(control_dep, Graph::kControlSlot).ToString(); bool found = false; for (int i = node_def->input_size() - 1; i >= 0; --i) { - const string& node_input = node_def->input(i); + const std::string& node_input = node_def->input(i); if (node_input[0] != '^') { // Control inputs are at the end. Break when we reach the non-control // inputs. @@ -984,7 +984,7 @@ void GraphConstructor::AddControlDependencies( void GraphConstructor::AddPrefixToNodeDef( const std::vector& input_already_exists, NodeDef* node_def) { if (prefix_.empty()) return; - node_def->set_name(strings::StrCat(prefix_, node_def->name())); + node_def->set_name(absl::StrCat(prefix_, node_def->name())); // Update names of input nodes for (int i = 0; i < node_def->input_size(); ++i) { // Skip remapped inputs (which already exist in g_ and are not being @@ -992,9 +992,9 @@ void GraphConstructor::AddPrefixToNodeDef( if (input_already_exists[i]) continue; absl::string_view input(node_def->input(i)); if (absl::ConsumePrefix(&input, "^")) { - node_def->set_input(i, strings::StrCat("^", prefix_, input)); + node_def->set_input(i, absl::StrCat("^", prefix_, input)); } else { - node_def->set_input(i, strings::StrCat(prefix_, input)); + node_def->set_input(i, absl::StrCat(prefix_, input)); } } // Update names of colocation groups @@ -1004,7 +1004,7 @@ void GraphConstructor::AddPrefixToNodeDef( for (int i = 0; i < list->s_size(); ++i) { absl::string_view v(list->s(i)); if (absl::ConsumePrefix(&v, kColocationGroupPrefix)) { - list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix_, v)); + list->set_s(i, absl::StrCat(kColocationGroupPrefix, prefix_, v)); } } } @@ -1013,7 +1013,7 @@ void GraphConstructor::AddPrefixToNodeDef( void GraphConstructor::UniquifyNames( const std::vector& input_already_exists, NodeDef* node_def) { if (NameExistsInGraph(node_def->name())) { - string old_name = node_def->name(); + std::string old_name = node_def->name(); node_def->set_name(FindUniqueName(node_def->name())); uniquified_names_[old_name] = node_def->name(); // Note that we don't have to update gdef_nodes_ or gdef_prefixes_ with @@ -1028,7 +1028,7 @@ void GraphConstructor::UniquifyNames( // We require that UniquifyNames() is called on all NodeDefs in topological // order. 
This guarantees that node_def's inputs will already be uniquified // if necessary. - auto iter = uniquified_names_.find(string(id.first)); + auto iter = uniquified_names_.find(std::string(id.first)); if (iter == uniquified_names_.end()) continue; id.first = iter->second; node_def->set_input(i, id.ToString()); @@ -1039,18 +1039,18 @@ void GraphConstructor::UpdateUniquifiedColocationNames() { for (const auto& pair : gdef_nodes_) { Node* node = pair.second.node; if (node == nullptr) continue; - std::vector coloc_values; + std::vector coloc_values; if (!TryGetNodeAttr(node->attrs(), kColocationAttrName, &coloc_values)) continue; bool updated = false; for (size_t i = 0; i < coloc_values.size(); ++i) { absl::string_view val(coloc_values[i]); if (absl::ConsumePrefix(&val, kColocationGroupPrefix)) { - auto name_pair = uniquified_names_.find(string(val)); + auto name_pair = uniquified_names_.find(std::string(val)); if (name_pair == uniquified_names_.end()) continue; updated = true; coloc_values[i] = - strings::StrCat(kColocationGroupPrefix, name_pair->second); + absl::StrCat(kColocationGroupPrefix, name_pair->second); } } if (updated) { @@ -1071,13 +1071,13 @@ bool GraphConstructor::NameExistsInGraphDef(absl::string_view name) { return false; } -string GraphConstructor::FindUniqueName(absl::string_view original_name) { - string name(original_name); +std::string GraphConstructor::FindUniqueName(absl::string_view original_name) { + std::string name(original_name); int count = 0; // Check that any generated names don't collide with imported NodeDefs (as // well as nodes in g_). while (NameExistsInGraph(name) || (count > 0 && NameExistsInGraphDef(name))) { - name = strings::StrCat(original_name, "_", ++count); + name = absl::StrCat(original_name, "_", ++count); } return name; } @@ -1280,7 +1280,7 @@ absl::Status GraphConstructor::Convert() { return errors::InvalidArgument(out.str()); } - inputs.emplace_back(string(tensor_id.node()), src_node, src_index); + inputs.emplace_back(std::string(tensor_id.node()), src_node, src_index); } if (has_data_back_edge && !IsMerge(node_def)) { diff --git a/tensorflow/core/common_runtime/graph_constructor.h b/tensorflow/core/common_runtime/graph_constructor.h index d0764903eb0931..e527801ea9f426 100644 --- a/tensorflow/core/common_runtime/graph_constructor.h +++ b/tensorflow/core/common_runtime/graph_constructor.h @@ -89,7 +89,7 @@ struct ImportGraphDefOptions { // prefix="animals" and GraphDef contains a node "bunny" then the node will be // named "animals/bunny" in *g. Must not be already used as a node name or // prefix in the graph. - string prefix; + std::string prefix; // If true, imported node names will be modified if their name already exists // in the graph. If false, conflicting names will be treated as an error. Note @@ -125,7 +125,7 @@ struct ImportGraphDefOptions { // Note that to avoid creating many redundant control edges, ImportGraphDef() // won't add control edges to nodes that will inherit the dependencies from // other nodes in `gdef`. - std::vector control_dependencies; + std::vector control_dependencies; // Tensors in `gdef` that will be returned via the ImportGraphDefResults // output parameter of `ImportGraphDef()`. If this list is non-empty, the @@ -151,7 +151,7 @@ struct ImportGraphDefOptions { // Unlike `return_tensors`, `input_map` has no effect on the nodes // returned. `return_nodes` must be empty if `skip_mapped_nodes` is true. // TODO(skyewm): make this work with `skip_mapped_nodes` if there's a need. 
- std::vector return_nodes; + std::vector return_nodes; // If true, checks that all colocation constraints are nodes in the GraphDef. bool validate_colocation_constraints = true; @@ -165,7 +165,7 @@ struct ImportGraphDefOptions { // python API. // Try to set default execution device for this grapth. - string default_device; + std::string default_device; // If true, propagates a node's assigned device. By default the runtime // will recompute the assigned device every time. diff --git a/tensorflow/core/common_runtime/graph_constructor_fuzz.cc b/tensorflow/core/common_runtime/graph_constructor_fuzz.cc index fa92230d4dcdc2..df0c63473b849d 100644 --- a/tensorflow/core/common_runtime/graph_constructor_fuzz.cc +++ b/tensorflow/core/common_runtime/graph_constructor_fuzz.cc @@ -63,10 +63,10 @@ void FuzzGraphEndToEndSimpleFixedInput(const GraphDef& graph_def) { p1.scalar()() = 1.0; Tensor p2(DT_FLOAT, TensorShape({1})); p2.scalar()() = 2.0; - std::vector> inputs = {{"Placeholder", p1}, - {"Placeholder_1", p2}}; - std::vector output_names = {"O_FUZZ"}; - std::vector target_names; + std::vector> inputs = {{"Placeholder", p1}, + {"Placeholder_1", p2}}; + std::vector output_names = {"O_FUZZ"}; + std::vector target_names; std::vector outputs; status = sess->Run(inputs, output_names, target_names, &outputs); } @@ -93,22 +93,22 @@ void FuzzGraphEndToEndAllStatic(const GraphDef& graph_def) { return; } - std::vector> inputs = {}; - std::vector output_names = {}; - std::vector target_names = {}; + std::vector> inputs = {}; + std::vector output_names = {}; + std::vector target_names = {}; std::vector outputs = {}; status = sess->Run(inputs, output_names, target_names, &outputs); } FUZZ_TEST(GraphDefFuzz, FuzzGraphEndToEndAllStatic); -Node* FindNode(const string& name, Graph* graph) { +Node* FindNode(const std::string& name, Graph* graph) { for (Node* n : graph->nodes()) { if (n->name() == name) return n; } return nullptr; } -bool HasNode(const string& name, Graph* graph) { +bool HasNode(const std::string& name, Graph* graph) { return FindNode(name, graph) != nullptr; } @@ -399,10 +399,10 @@ void FuzzGraphEndToEndFDP(std::vector data) { input_tensors.push_back(input_tensor); } - std::vector> inputs = {{"N0", input_tensors[0]}, - {"N1", input_tensors[1]}}; - std::vector output_names = {last_node}; - std::vector target_names; + std::vector> inputs = { + {"N0", input_tensors[0]}, {"N1", input_tensors[1]}}; + std::vector output_names = {last_node}; + std::vector target_names; std::vector outputs; s = sess->Run(inputs, output_names, target_names, &outputs); if (!s.ok()) { diff --git a/tensorflow/core/common_runtime/graph_constructor_test.cc b/tensorflow/core/common_runtime/graph_constructor_test.cc index 9494bf48f9a74f..036ee63a354f89 100644 --- a/tensorflow/core/common_runtime/graph_constructor_test.cc +++ b/tensorflow/core/common_runtime/graph_constructor_test.cc @@ -53,22 +53,22 @@ class GraphConstructorTest : public ::testing::Test { protected: GraphConstructorTest() : graph_(OpRegistry::Global()) {} - void Convert(const string& gdef_ascii) { + void Convert(const std::string& gdef_ascii) { CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &gdef_)); } - void ExpectError(const string& gdef_ascii, - const std::vector& expected_error_strs, - string not_expected_error_str = "") { + void ExpectError(const std::string& gdef_ascii, + const std::vector& expected_error_strs, + std::string not_expected_error_str = "") { // Used to verify that errors don't change graph - const string original_graph_description = 
GraphDebugString(); + const std::string original_graph_description = GraphDebugString(); Convert(gdef_ascii); GraphConstructorOptions opts; absl::Status status = ConvertGraphDefToGraph(opts, gdef_, &graph_); EXPECT_FALSE(status.ok()); - for (const string& error : expected_error_strs) { + for (const std::string& error : expected_error_strs) { EXPECT_TRUE(absl::StrContains(status.message(), error)) << "Expected to find '" << error << "' in " << status; } @@ -82,19 +82,20 @@ class GraphConstructorTest : public ::testing::Test { EXPECT_EQ(original_graph_description, GraphDebugString()); } - void ExpectError(const string& gdef_ascii, const ImportGraphDefOptions& opts, - const std::vector& expected_error_strs, + void ExpectError(const std::string& gdef_ascii, + const ImportGraphDefOptions& opts, + const std::vector& expected_error_strs, ShapeRefiner* refiner = nullptr, ImportGraphDefResults* results = nullptr) { // Used to verify that errors don't change graph - const string original_graph_description = GraphDebugString(); + const std::string original_graph_description = GraphDebugString(); Convert(gdef_ascii); absl::Status status = ImportGraphDef(opts, gdef_, &graph_, refiner, results); EXPECT_FALSE(status.ok()); - for (const string& error : expected_error_strs) { + for (const std::string& error : expected_error_strs) { EXPECT_TRUE(absl::StrContains(status.message(), error)) << "Expected to find '" << error << "' in " << status; } @@ -102,13 +103,14 @@ class GraphConstructorTest : public ::testing::Test { EXPECT_EQ(original_graph_description, GraphDebugString()); } - void ExpectOK(const string& gdef_ascii) { + void ExpectOK(const std::string& gdef_ascii) { Convert(gdef_ascii); GraphConstructorOptions opts; TF_CHECK_OK(ConvertGraphDefToGraph(opts, gdef_, &graph_)); } - void ExpectOK(const string& gdef_ascii, const ImportGraphDefOptions& opts, + void ExpectOK(const std::string& gdef_ascii, + const ImportGraphDefOptions& opts, ShapeRefiner* refiner = nullptr, ImportGraphDefResults* results = nullptr) { Convert(gdef_ascii); @@ -125,16 +127,17 @@ class GraphConstructorTest : public ::testing::Test { << graph_.versions().producer(); } - Node* FindNode(const string& name) { + Node* FindNode(const std::string& name) { for (Node* n : graph_.nodes()) { if (n->name() == name) return n; } return nullptr; } - bool HasNode(const string& name) { return FindNode(name) != nullptr; } + bool HasNode(const std::string& name) { return FindNode(name) != nullptr; } - bool HasEdge(const string& src, int src_out, const string& dst, int dst_in) { + bool HasEdge(const std::string& src, int src_out, const std::string& dst, + int dst_in) { for (const Edge* e : graph_.edges()) { if (e->src()->name() == src && e->src_output() == src_out && e->dst()->name() == dst && e->dst_input() == dst_in) { @@ -144,11 +147,11 @@ class GraphConstructorTest : public ::testing::Test { return false; } - bool HasControlEdge(const string& src, const string& dst) { + bool HasControlEdge(const std::string& src, const std::string& dst) { return HasEdge(src, Graph::kControlSlot, dst, Graph::kControlSlot); } - string ColocationGroup(const string& node) { + std::string ColocationGroup(const std::string& node) { Node* n = nullptr; for (Node* ni : graph_.nodes()) { if (ni->name() == node) { @@ -159,7 +162,7 @@ class GraphConstructorTest : public ::testing::Test { if (n == nullptr) { return ""; } - std::vector value; + std::vector value; absl::Status s = GetNodeAttr(n->attrs(), kColocationAttrName, &value); if (!s.ok()) { return ""; @@ -171,10 +174,11 
@@ class GraphConstructorTest : public ::testing::Test { return ""; } absl::string_view loc(value[0]); - return absl::ConsumePrefix(&loc, kColocationGroupPrefix) ? string(loc) : ""; + return absl::ConsumePrefix(&loc, kColocationGroupPrefix) ? std::string(loc) + : ""; } - string GraphDebugString() const { + std::string GraphDebugString() const { return graph_.ToGraphDefDebug().DebugString(); } @@ -232,7 +236,7 @@ REGISTER_OP("RequiresCurrentGraphVersion") TEST_F(GraphConstructorTest, InvalidNodeName) { auto expect_invalid_name = [this](const char* name) { - ExpectError(strings::StrCat("node { name: '", name, "' op: 'ABC' }"), + ExpectError(absl::StrCat("node { name: '", name, "' op: 'ABC' }"), {"Node name contains invalid characters"}); }; @@ -504,7 +508,7 @@ TEST_F(GraphConstructorTest, ImportGraphThatUsesConstantValueFromInsideLoop) { f.write(str(tf.get_default_graph().as_graph_def())) */ - const string pb_ascii = R"EOF( + const std::string pb_ascii = R"EOF( node { name: "Const" op: "Const" @@ -862,7 +866,7 @@ TEST_F(GraphConstructorTest, NoForwardCompatError) { } TEST_F(GraphConstructorTest, LowVersion) { - ExpectError(strings::StrCat("versions { producer: ", -1, " }"), + ExpectError(absl::StrCat("versions { producer: ", -1, " }"), {strings::StrCat("GraphDef producer version -1 below min " "producer ", TF_GRAPH_DEF_VERSION_MIN_PRODUCER, @@ -872,7 +876,7 @@ TEST_F(GraphConstructorTest, LowVersion) { TEST_F(GraphConstructorTest, HighVersion) { const int version = TF_GRAPH_DEF_VERSION + 1; - ExpectError(strings::StrCat("versions { min_consumer: ", version, " }"), + ExpectError(absl::StrCat("versions { min_consumer: ", version, " }"), {strings::StrCat("GraphDef min consumer version ", version, " above current version ", TF_GRAPH_DEF_VERSION, " for TensorFlow ", TF_VERSION_STRING, @@ -885,7 +889,7 @@ TEST_F(GraphConstructorTest, BadVersion) { ExpectError( strings::StrCat("versions { producer: ", version, " bad_consumers: ", bad, " }"), - {strings::StrCat( + {absl::StrCat( "GraphDef disallows consumer version ", bad, ". Please upgrade TensorFlow: this version is likely buggy.")}); } @@ -932,8 +936,8 @@ TEST_F(GraphConstructorTest, Error_ControlEdgeBeforeRealInput) { TEST_F(GraphConstructorTest, ImportGraphDef) { GraphDef def; ImportGraphDefOptions opts; - const string& source = graph_.FindNodeId(Graph::kSourceId)->name(); - const string& sink = graph_.FindNodeId(Graph::kSinkId)->name(); + const std::string& source = graph_.FindNodeId(Graph::kSourceId)->name(); + const std::string& sink = graph_.FindNodeId(Graph::kSinkId)->name(); // Importing an empty graph is fine. 
absl::Status s = ImportGraphDef(opts, def, &graph_, nullptr); @@ -2447,8 +2451,8 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ErrorsDoNoChangeTheGraph) { TF_EXPECT_OK( NodeDefBuilder("scope/A", "TestParams").Finalize(def.add_node())); ImportGraphDefOptions opts; - const string& source = graph_.FindNodeId(Graph::kSourceId)->name(); - const string& sink = graph_.FindNodeId(Graph::kSinkId)->name(); + const std::string& source = graph_.FindNodeId(Graph::kSourceId)->name(); + const std::string& sink = graph_.FindNodeId(Graph::kSinkId)->name(); absl::Status s = ImportGraphDef(opts, def, &graph_, nullptr); ASSERT_EQ(absl::OkStatus(), s) << s; @@ -2457,7 +2461,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ErrorsDoNoChangeTheGraph) { EXPECT_TRUE(HasControlEdge(source, "scope/A")); EXPECT_TRUE(HasControlEdge("scope/A", sink)); EXPECT_EQ(3, graph_.num_edges()); - const string original_graph_description = GraphDebugString(); + const std::string original_graph_description = GraphDebugString(); #define EXPECT_IMPORT_FAILURE(graph_def, options, expected_err) \ do { \ @@ -2663,10 +2667,10 @@ TEST_F(GraphConstructorTest, ImportGraphDef_FunctionDefs) { p1.scalar()() = 1.0; Tensor p2(DT_FLOAT, TensorShape({1})); p2.scalar()() = 2.0; - std::vector> inputs = {{"Placeholder", p1}, - {"Placeholder_1", p2}}; - std::vector output_names = {"Foo_d03c39a3"}; - std::vector target_names; + std::vector> inputs = {{"Placeholder", p1}, + {"Placeholder_1", p2}}; + std::vector output_names = {"Foo_d03c39a3"}; + std::vector target_names; std::vector outputs; TF_ASSERT_OK(sess->Run(inputs, output_names, target_names, &outputs)); @@ -2756,10 +2760,10 @@ TEST_F(GraphConstructorTest, ImportGraphDef_NestedFunctionDefs) { p1.scalar()() = 1.0; Tensor p2(DT_FLOAT, TensorShape({1})); p2.scalar()() = 2.0; - std::vector> inputs = {{"Placeholder", p1}, - {"Placeholder_1", p2}}; - std::vector output_names = {"Outer_966fa13d"}; - std::vector target_names; + std::vector> inputs = {{"Placeholder", p1}, + {"Placeholder_1", p2}}; + std::vector output_names = {"Outer_966fa13d"}; + std::vector target_names; std::vector outputs; s = sess->Run(inputs, output_names, target_names, &outputs); ASSERT_TRUE(s.ok()) << s.message(); @@ -2835,16 +2839,16 @@ TEST_F(GraphConstructorTest, CopyGraph) { // Confirms that graph def version in the graph reaches the shape inference // function. TEST_F(GraphConstructorTest, GraphDefVersionUsedForShapeInference) { - string gdef_ascii = strings::StrCat(R"EOF( + std::string gdef_ascii = absl::StrCat(R"EOF( node{ name:"A" op:"RequiresCurrentGraphVersion" } versions { producer: )EOF", - TF_GRAPH_DEF_VERSION - 1, "}"); + TF_GRAPH_DEF_VERSION - 1, "}"); ImportGraphDefOptions opts; ExpectError(gdef_ascii, opts, {"Wrong graph version for shape"}); - gdef_ascii = strings::StrCat(R"EOF( + gdef_ascii = absl::StrCat(R"EOF( node{ name:"A" op:"RequiresCurrentGraphVersion" } versions { producer: )EOF", - TF_GRAPH_DEF_VERSION, "}"); + TF_GRAPH_DEF_VERSION, "}"); ExpectOK(gdef_ascii, opts); } @@ -2887,7 +2891,7 @@ TEST_F(GraphConstructorTest, ImportGraphDefProvidedShapeRefinerVersions) { ImportGraphDefOptions opts; // A valid graph at producer version 20, but one // that would not import if the graph_def_version were 21. 
- string gdef_ascii; + std::string gdef_ascii; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ gdef_ascii = strings::StrCat(R"EOF( node { @@ -2973,7 +2977,7 @@ versions { })EOF"); #else - gdef_ascii = strings::StrCat(R"EOF( + gdef_ascii = R"EOF( node { name: "Sum/input" op: "Const" @@ -3054,7 +3058,7 @@ node { } versions { producer: 20 -})EOF"); +})EOF"; #endif // Create a shape refiner with the latest TF_GRAPH_DEF_VERSION. // Importing the graphdef with an existing refiner should @@ -3098,7 +3102,7 @@ versions { })EOF"); #else - gdef_ascii = strings::StrCat(R"EOF( + gdef_ascii = R"EOF( node { name: "RandomConst" op: "Const" @@ -3128,7 +3132,7 @@ node { } versions { producer: 21 -})EOF"); +})EOF"; #endif ExpectOK(gdef_ascii, opts, &refiner); @@ -3171,7 +3175,7 @@ versions { })EOF"); #else - gdef_ascii = strings::StrCat(R"EOF( + gdef_ascii = R"EOF( node { name: "RandomConst2" op: "Const" @@ -3201,7 +3205,7 @@ node { } versions { producer: 17 -})EOF"); +})EOF"; #endif ExpectOK(gdef_ascii, opts, &refiner); @@ -3242,7 +3246,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ValidateDefaultDevice) { ImportGraphDefResults res; TF_ASSERT_OK(ImportGraphDef(options, gdef, &graph_, nullptr, &res)); - std::map node2dev; + std::map node2dev; for (Node* n : graph_.nodes()) { node2dev[n->name()] = n->requested_device(); } @@ -3253,7 +3257,8 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ValidateDefaultDevice) { } TEST_F(GraphConstructorTest, ImportGraphDef_UnknownOps) { - const string pb_ascii = "node { name: 'op_from_contrib' op: 'OpFromContrib'}"; + const std::string pb_ascii = + "node { name: 'op_from_contrib' op: 'OpFromContrib'}"; // Try load twice to check for two parts of the error message. We cannot check // for the whole thing in one go because the message includes the hostname. ExpectError(pb_ascii, {"Op type not registered 'OpFromContrib'"}); diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index d7a9462e387d2d..a3c1d024babae0 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -64,7 +64,7 @@ limitations under the License. namespace tensorflow { namespace { -bool IsCollectiveV2(const string& op) { +bool IsCollectiveV2(const std::string& op) { return op == "CollectiveReduceV2" || op == "CollectiveGatherV2" || op == "CollectiveBcastRecvV2" || op == "CollectiveBcastSendV2" || op == "ColectiveReduceScatterV2" || op == "ColectiveAllToAllV2"; @@ -199,7 +199,7 @@ absl::Status GraphExecutionState::Extend( *gdef.mutable_library() = flib_def_->ToProto(); // 2. Build an index of the new node names. 
- std::unordered_set new_names; + std::unordered_set new_names; for (const NodeDef& node : extension_def.node()) { new_names.insert(node.name()); } @@ -315,7 +315,7 @@ namespace { class TensorConnectionPruneRewrite : public subgraph::PruneRewrite { public: - TensorConnectionPruneRewrite(const string* endpoint_name, + TensorConnectionPruneRewrite(const std::string* endpoint_name, NodeBuilder::NodeOut from_tensor) : subgraph::PruneRewrite(endpoint_name, nullptr /* device_info */), from_tensor_(std::move(from_tensor)) {} @@ -336,8 +336,8 @@ class TensorConnectionPruneRewrite : public subgraph::PruneRewrite { TF_RETURN_IF_ERROR(s); TF_RETURN_IF_ERROR( - NodeBuilder(strings::StrCat("_identity_", feed_tensor.node->name(), "_", - feed_tensor.index), + NodeBuilder(absl::StrCat("_identity_", feed_tensor.node->name(), "_", + feed_tensor.index), "Identity") .Input(from_tensor_) .Attr("T", @@ -355,7 +355,7 @@ class TensorConnectionPruneRewrite : public subgraph::PruneRewrite { template absl::Status LookupDevice( - const DeviceSet& device_set, const string& tensor_name, + const DeviceSet& device_set, const std::string& tensor_name, const Map& tensor2device, const tensorflow::DeviceAttributes** out_device_attrs) { *out_device_attrs = nullptr; @@ -394,7 +394,7 @@ struct TensorAndDevice { // Tensors of some DataTypes cannot placed in device memory as feeds or // fetches. Validate against a allowlist of those known to work. -bool IsFeedAndFetchSupported(DataType dtype, const string& device_type) { +bool IsFeedAndFetchSupported(DataType dtype, const std::string& device_type) { // The mechanism for supporting feeds of device-backed Tensors requires // the _Arg kernel to be registered for the corresponding type (and that // the input to the kernel be in device and not host memory). @@ -474,8 +474,8 @@ absl::Status ValidateFeedAndFetchDevices( absl::Status GetFeedShapeAndTypeFromAttribute(const NodeDef& node, PartialTensorShape* shape, DataType* type) { - static const gtl::FlatSet* const kHasExplicitShapeAttribute = - CHECK_NOTNULL((new gtl::FlatSet{ + static const gtl::FlatSet* const kHasExplicitShapeAttribute = + CHECK_NOTNULL((new gtl::FlatSet{ "Placeholder", "PlaceholderV2", "PlaceholderWithDefault", "ParallelConcat", "ImmutableConst", "_ParallelConcatStart", "InfeedDequeue", "OutfeedDequeue", "CollectiveBcastSend", @@ -520,7 +520,7 @@ absl::Status GraphExecutionState::PruneGraph( for (int i = 0; i < options.callable_options.feed_size(); ++i) { // WARNING: feed MUST be a reference, since ArgFeedRewrite and // tensors_and_devices holds on to its address. - const string& feed = options.callable_options.feed(i); + const std::string& feed = options.callable_options.feed(i); const DeviceAttributes* device_info; TF_RETURN_IF_ERROR(LookupDevice(*device_set_, feed, options.callable_options.feed_devices(), @@ -540,7 +540,7 @@ absl::Status GraphExecutionState::PruneGraph( for (int i = 0; i < options.callable_options.fetch_size(); ++i) { // WARNING: fetch MUST be a reference, since RetvalFetchRewrite and // tensors_and_devices holds on to its address. 
- const string& fetch = options.callable_options.fetch(i); + const std::string& fetch = options.callable_options.fetch(i); const DeviceAttributes* device_info; TF_RETURN_IF_ERROR(LookupDevice(*device_set_, fetch, options.callable_options.fetch_devices(), @@ -561,11 +561,11 @@ absl::Status GraphExecutionState::PruneGraph( } const DeviceAttributes* device_info = &device_set_->client_device()->attributes(); - for (const string& feed : options.callable_options.feed()) { + for (const std::string& feed : options.callable_options.feed()) { feed_rewrites.emplace_back( new subgraph::RecvFeedRewrite(&feed, device_info)); } - for (const string& fetch : options.callable_options.fetch()) { + for (const std::string& fetch : options.callable_options.fetch()) { fetch_rewrites.emplace_back( new subgraph::SendFetchRewrite(&fetch, device_info)); } @@ -598,7 +598,7 @@ absl::Status GraphExecutionState::PruneGraph( &tensor_connection.to_tensor(), {from_node, from_id.second})); } - std::vector target_node_names( + std::vector target_node_names( options.callable_options.target().begin(), options.callable_options.target().end()); TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution( @@ -699,7 +699,7 @@ absl::Status GraphExecutionState::OptimizeGraph( options.callable_options.tensor_connection().empty())) { std::vector feeds; - for (const string& feed : options.callable_options.feed()) { + for (const std::string& feed : options.callable_options.feed()) { feeds.emplace_back(ParseTensorName(feed)); } for (const TensorConnection& tensor_connection : @@ -830,7 +830,7 @@ absl::Status GraphExecutionState::OptimizeGraph( *optimized_flib = std::make_unique(*flib_def); for (const FunctionDef& fdef : new_graph.library().function()) { - const string& func_name = fdef.signature().name(); + const std::string& func_name = fdef.signature().name(); if ((*optimized_flib)->Contains(func_name)) { VLOG(3) << "Replace function: name=" << func_name; @@ -864,7 +864,7 @@ absl::Status GraphExecutionState::OptimizeGraph( absl::Status GraphExecutionState::BuildGraph( const BuildGraphOptions& options, std::unique_ptr* out) { VLOG(1) << "BuildGraph"; - const uint64 start_time_usecs = Env::Default()->NowMicros(); + const uint64_t start_time_usecs = Env::Default()->NowMicros(); if (!graph_) { // It is only valid to call this method directly when the original graph // was created with the option `place_pruned_graph == false`. @@ -922,7 +922,7 @@ absl::Status GraphExecutionState::BuildGraph( // nodes in the Graph and FunctionLibraryDefinition for collective ops and // if found, initialize a collective_graph_key as a hash of the ordered set // of instance keys. - std::set instance_key_set; + std::set instance_key_set; bool has_collective_v2 = false; for (Node* node : optimized_graph->nodes()) { if (node->IsCollective()) { @@ -952,7 +952,7 @@ absl::Status GraphExecutionState::BuildGraph( } } if (!instance_key_set.empty()) { - uint64 hash = 0x8774aa605c729c72ULL; + uint64_t hash = 0x8774aa605c729c72ULL; for (int32_t instance_key : instance_key_set) { hash = Hash64Combine(instance_key, hash); } diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h index 4f713ae922f12d..a718b57063f10d 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.h +++ b/tensorflow/core/common_runtime/graph_execution_state.h @@ -43,10 +43,10 @@ struct GraphExecutionStateOptions { const DeviceSet* device_set = nullptr; const SessionOptions* session_options = nullptr; // Unique session identifier. 
Can be empty. - string session_handle; + std::string session_handle; // A map from node name to device name, representing the unchangeable // placement of stateful nodes. - std::unordered_map stateful_placements; + std::unordered_map stateful_placements; // Whether to run Placer on the graph. bool run_placer = true; @@ -166,7 +166,7 @@ class GraphExecutionState { const FunctionLibraryDefinition& flib_def() const { return *flib_def_; } // Returns the node with the given name, or null if it does not exist. - const Node* get_node_by_name(const string& name) const { + const Node* get_node_by_name(const std::string& name) const { NodeNameToCostIdMap::const_iterator iter = node_name_to_cost_id_map_.find(name); if (iter != node_name_to_cost_id_map_.end()) { @@ -178,7 +178,7 @@ class GraphExecutionState { // Returns the map of stateful placements as a map of // node name to placement string. - std::unordered_map GetStatefulPlacements() const { + std::unordered_map GetStatefulPlacements() const { return stateful_placements_; } @@ -194,8 +194,9 @@ class GraphExecutionState { // is true, such as "params" and "queue" nodes. Once placed these // nodes can not be moved to a different device. Maps node names to // device names. - std::unordered_map stateful_placements_; // Immutable after - // ctor. + std::unordered_map + stateful_placements_; // Immutable after + // ctor. void SaveStatefulNodes(Graph* graph); void RestoreStatefulNodes(Graph* graph); @@ -215,7 +216,7 @@ class GraphExecutionState { const DeviceSet* device_set_; // Not owned const SessionOptions* session_options_; // Not owned // Unique session identifier. Can be empty. - string session_handle_; + std::string session_handle_; // Map from name to Node for the full graph in placed_. NodeNameToCostIdMap node_name_to_cost_id_map_; diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h index f8322cfe7213a2..746c080e4d3f66 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.h +++ b/tensorflow/core/common_runtime/graph_optimizer.h @@ -36,8 +36,8 @@ class GraphOptimizer { // pass may replace a node with a different node of the same name that has a // different number of outputs, or outputs with different known shapes. // TODO(b/65453533) introduce a unique way to name nodes in a graph. - std::unordered_map>* shape_map = - nullptr; + std::unordered_map>* + shape_map = nullptr; // If not null then only nodes for which cse_consider_fn returns true will // be considered for CSE. 
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc index 90052d68873c6a..8379c126e22711 100644 --- a/tensorflow/core/common_runtime/graph_runner.cc +++ b/tensorflow/core/common_runtime/graph_runner.cc @@ -58,7 +58,7 @@ class SimpleRendezvous : public RendezvousInterface { } mutex_lock l(mu_); - string edge_name(parsed.edge_name); + std::string edge_name(parsed.edge_name); if (table_.count(edge_name) > 0) { return errors::Internal("Send of an already sent tensor"); } @@ -71,7 +71,7 @@ class SimpleRendezvous : public RendezvousInterface { Tensor tensor; absl::Status status = absl::OkStatus(); { - string key(parsed.edge_name); + std::string key(parsed.edge_name); mutex_lock l(mu_); if (table_.count(key) <= 0) { status = errors::Internal("Did not find key ", key); @@ -85,7 +85,7 @@ class SimpleRendezvous : public RendezvousInterface { void StartAbort(const absl::Status& status) override {} private: - typedef std::unordered_map Table; + typedef std::unordered_map Table; mutex mu_; Table table_ TF_GUARDED_BY(mu_); @@ -103,7 +103,7 @@ GraphRunner::~GraphRunner() {} absl::Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library, const NamedTensorList& inputs, - const std::vector& output_names, + const std::vector& output_names, std::vector* outputs) { if (device_ == nullptr) { return errors::NotFound("Cannot find a device for GraphRunner."); @@ -130,12 +130,12 @@ absl::Status GraphRunner::Run(Graph* graph, SimpleRendezvous rendez; // Extract the input names and keys, and feed in the inputs. - std::vector input_names; + std::vector input_names; for (const auto& in : inputs) { - const string& tensor_name = in.first; + const std::string& tensor_name = in.first; input_names.emplace_back(tensor_name); - string full_key = Rendezvous::CreateKey("/device:CPU:0", 1, "/device:CPU:1", - tensor_name, FrameAndIter(0, 0)); + std::string full_key = Rendezvous::CreateKey( + "/device:CPU:0", 1, "/device:CPU:1", tensor_name, FrameAndIter(0, 0)); Rendezvous::ParsedKey parsed; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(full_key, &parsed)); TF_RETURN_IF_ERROR(rendez.Send(parsed, Rendezvous::Args(), in.second, @@ -194,7 +194,7 @@ absl::Status GraphRunner::Run(Graph* graph, outputs->resize(output_names.size()); for (size_t i = 0; i < output_names.size(); ++i) { - const string& output_key = + const std::string& output_key = Rendezvous::CreateKey("/device:CPU:0", 1, "/device:CPU:1", output_names[i], FrameAndIter(0, 0)); Rendezvous::ParsedKey parsed; diff --git a/tensorflow/core/common_runtime/graph_runner.h b/tensorflow/core/common_runtime/graph_runner.h index a40d17b862b0af..3f651727db5923 100644 --- a/tensorflow/core/common_runtime/graph_runner.h +++ b/tensorflow/core/common_runtime/graph_runner.h @@ -58,10 +58,10 @@ class GraphRunner { // // REQUIRES: `graph`, `env`, and `outputs` are not nullptr. // `function_library` may be nullptr. 
- typedef std::vector> NamedTensorList; + typedef std::vector> NamedTensorList; absl::Status Run(Graph* graph, FunctionLibraryRuntime* function_library, const NamedTensorList& inputs, - const std::vector& output_names, + const std::vector& output_names, std::vector* outputs); private: diff --git a/tensorflow/core/common_runtime/graph_runner_test.cc b/tensorflow/core/common_runtime/graph_runner_test.cc index fa9798b929f79e..2d41bc455d5322 100644 --- a/tensorflow/core/common_runtime/graph_runner_test.cc +++ b/tensorflow/core/common_runtime/graph_runner_test.cc @@ -64,8 +64,8 @@ TEST(GraphRunnerTest, DeepCopy) { Tensor p2_data(DT_FLOAT, TensorShape({})); p1_data.scalar()() = 1.0f; p2_data.scalar()() = 2.0f; - std::vector> inputs = {{"p1:0", p1_data}, - {"p2:0", p2_data}}; + std::vector> inputs = {{"p1:0", p1_data}, + {"p2:0", p2_data}}; // Create and destroy the GraphRunner, and ensure that the outputs are // consumable beyond the lifetime of GraphRunner. @@ -102,8 +102,8 @@ TEST(GraphRunnerTest, FeedAndFetch) { Tensor p2_data(DT_FLOAT, TensorShape({})); p1_data.scalar()() = 1.0f; p2_data.scalar()() = 2.0f; - std::vector> inputs = {{"p1:0", p1_data}, - {"p2:0", p2_data}}; + std::vector> inputs = {{"p1:0", p1_data}, + {"p2:0", p2_data}}; GraphRunner graph_runner(Env::Default()); std::vector outputs; diff --git a/tensorflow/core/common_runtime/graph_view.cc b/tensorflow/core/common_runtime/graph_view.cc index f84dbfac0d3f6d..65359febf97937 100644 --- a/tensorflow/core/common_runtime/graph_view.cc +++ b/tensorflow/core/common_runtime/graph_view.cc @@ -40,12 +40,12 @@ limitations under the License. namespace tensorflow { -string NodeItem::DebugString() const { - string ret = strings::StrCat("{name:'", kernel->name(), "' id:", node_id); +std::string NodeItem::DebugString() const { + std::string ret = absl::StrCat("{name:'", kernel->name(), "' id:", node_id); if (is_source) { - strings::StrAppend(&ret, " source}"); + absl::StrAppend(&ret, " source}"); } else { - strings::StrAppend(&ret, " def:{", SummarizeNodeDef(kernel->def()), "}}"); + absl::StrAppend(&ret, " def:{", SummarizeNodeDef(kernel->def()), "}}"); } return ret; } @@ -67,7 +67,7 @@ GraphView::~GraphView() { } namespace { -typedef std::tuple OutputAndControlEdges; +typedef std::tuple OutputAndControlEdges; OutputAndControlEdges CountOutputEdges(const Node* n) { DCHECK_LE(n->out_edges().size(), std::numeric_limits::max()); @@ -102,8 +102,8 @@ size_t GraphView::NodeItemBytes(const Node* n) { sizeof(ControlEdgeInfo) // output_control_edges[...] + num_outputs * sizeof(AllocatorAttributes) // output_attr[...] + num_outputs * sizeof(int) // forward_from[num_outputs] - + num_inputs * sizeof(uint8) // input_type[num_inputs] - + num_outputs * sizeof(uint8); // output_type[num_outputs] + + num_inputs * sizeof(uint8_t) // input_type[num_inputs] + + num_outputs * sizeof(uint8_t); // output_type[num_outputs] static constexpr size_t kItemAlignment = sizeof(NodeItem*); static_assert(kItemAlignment % alignof(NodeItem) == 0, "NodeItem must be aligned with kItemAlignment"); @@ -141,7 +141,7 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) { // values as "int" vs "size_t" in CHECK_LE. 
CHECK_LE(static_cast(ptr - space_), std::numeric_limits::max()); - const uint32 offset = static_cast(ptr - space_); + const uint32_t offset = static_cast(ptr - space_); node_offsets_[id] = offset; ptr += bytes; @@ -197,10 +197,10 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) { } DCHECK_LT(DataType_MAX, 255); // Must fit in uint8 - uint8* input_types = item->input_type_base(); + uint8_t* input_types = item->input_type_base(); item->is_any_input_ref_typed = false; for (int i = 0; i < num_inputs; i++) { - input_types[i] = static_cast(n->input_type(i)); + input_types[i] = static_cast(n->input_type(i)); DCHECK_EQ(item->input_type(i), n->input_type(i)); item->is_any_input_ref_typed |= IsRefType(n->input_type(i)); } @@ -215,9 +215,9 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) { GetNodeAttr(n->attrs(), "_scoped_allocator", &scoped_allocator_attrs); int* forward_from = item->forward_from_base(); - uint8* output_types = item->output_type_base(); + uint8_t* output_types = item->output_type_base(); for (int i = 0; i < num_outputs; ++i) { - output_types[i] = static_cast(n->output_type(i)); + output_types[i] = static_cast(n->output_type(i)); DCHECK_EQ(item->output_type(i), n->output_type(i)); forward_from[i] = OpKernelContext::Params::kNoReservation; @@ -264,7 +264,7 @@ absl::Status GraphView::Initialize(const Graph* g) { total_bytes += NodeItemBytes(n); } - node_offsets_ = new uint32[num_nodes]; + node_offsets_ = new uint32_t[num_nodes]; for (int i = 0; i < num_nodes; i++) { node_offsets_[i] = std::numeric_limits::max(); } @@ -363,7 +363,7 @@ absl::Status InferAllocAttr(const Node* n, const Node* dst, // Note that it's possible for *n to be a Recv and *dst to be a Send, // so these two cases are not mutually exclusive. if (IsRecv(n)) { - string src_name; + std::string src_name; s = GetNodeAttr(n->attrs(), "send_device", &src_name); if (!s.ok()) return s; DeviceNameUtils::ParsedName parsed_src_name; @@ -388,7 +388,7 @@ absl::Status InferAllocAttr(const Node* n, const Node* dst, } } if (IsSend(dst)) { - string dst_name; + std::string dst_name; s = GetNodeAttr(dst->attrs(), "recv_device", &dst_name); if (!s.ok()) return s; DeviceNameUtils::ParsedName parsed_dst_name; diff --git a/tensorflow/core/common_runtime/graph_view.h b/tensorflow/core/common_runtime/graph_view.h index 3864df8a6ce165..32df420842d657 100644 --- a/tensorflow/core/common_runtime/graph_view.h +++ b/tensorflow/core/common_runtime/graph_view.h @@ -102,10 +102,10 @@ struct NodeItem { int input_start = 0; // Number of output edges, excluding control edges. - int32 num_output_edges; + int32_t num_output_edges; // Number of output control edges. - int32 num_output_control_edges; + int32_t num_output_control_edges; // If non-null, contains an array of num_outputs bools, where the ith bool // is true if and only if the ith output is consumed by another node. @@ -143,7 +143,7 @@ struct NodeItem { // 0... for forward from that input. 
const int* forward_from() const { return forward_from_base(); } - string DebugString() const; + std::string DebugString() const; private: friend class GraphView; @@ -185,18 +185,18 @@ struct NodeItem { num_output_control_edges + sizeof(AllocatorAttributes) * num_outputs); } - uint8* input_type_base() const { - return reinterpret_cast( + uint8_t* input_type_base() const { + return reinterpret_cast( var() + sizeof(EdgeInfo) * num_output_edges + sizeof(ControlEdgeInfo) * num_output_control_edges + sizeof(AllocatorAttributes) * num_outputs + sizeof(int) * num_outputs); } - uint8* output_type_base() const { - return reinterpret_cast( + uint8_t* output_type_base() const { + return reinterpret_cast( var() + sizeof(EdgeInfo) * num_output_edges + sizeof(ControlEdgeInfo) * num_output_control_edges + sizeof(AllocatorAttributes) * num_outputs + sizeof(int) * num_outputs + - sizeof(uint8) * num_inputs); + sizeof(uint8_t) * num_inputs); } NodeItem(const NodeItem&) = delete; @@ -220,7 +220,7 @@ class GraphView { NodeItem* node(int32_t id) const { DCHECK_GE(id, 0); DCHECK_LT(id, num_nodes_); - uint32 offset = node_offsets_[id]; + uint32_t offset = node_offsets_[id]; return ((offset == std::numeric_limits::max()) ? nullptr : reinterpret_cast(space_ + node_offsets_[id])); @@ -232,19 +232,19 @@ class GraphView { const NodeItem& node_ref(int32_t id) const { DCHECK_GE(id, 0); DCHECK_LT(id, num_nodes_); - uint32 offset = node_offsets_[id]; + uint32_t offset = node_offsets_[id]; DCHECK_NE(offset, std::numeric_limits::max()); return *reinterpret_cast(space_ + node_offsets_[id]); } - int32 num_nodes() const { return num_nodes_; } + int32_t num_nodes() const { return num_nodes_; } private: char* InitializeNode(char* ptr, const Node* n); size_t NodeItemBytes(const Node* n); - int32 num_nodes_ = 0; - uint32* node_offsets_ = nullptr; // array of size "num_nodes_" + int32_t num_nodes_ = 0; + uint32_t* node_offsets_ = nullptr; // array of size "num_nodes_" // node_offsets_[id] holds the byte offset for node w/ "id" in space_ char* space_; // NodeItem objects are allocated here diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc index eccea063ad5abf..ebbdfde177da79 100644 --- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc +++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc @@ -43,8 +43,8 @@ namespace tensorflow { namespace { // Key to be used for BufRendezvous by Broadcaster. -string BroadcastBufKey(const string& exec_key, int subdiv, int src_rank, - int dst_rank) { +std::string BroadcastBufKey(const std::string& exec_key, int subdiv, + int src_rank, int dst_rank) { if (READABLE_KEYS) { return strings::StrCat("broadcast(", exec_key, "):subdiv(", subdiv, "):src(", src_rank, "):dst(", dst_rank, ")"); @@ -81,13 +81,13 @@ absl::Status HierarchicalTreeBroadcaster::InitializeCollectiveParams( CHECK_EQ(col_params->instance.type, BROADCAST_COLLECTIVE); CHECK_EQ(col_params->instance.impl_details.collective_name, "HierarchicalTreeBroadcast"); - const string& device_name = + const std::string& device_name = col_params->group.members[col_params->default_rank].device.name(); // Start by counting the devices in each task. // Precondition: device_names must be sorted so that all devices in // the same task are adjacent. 
std::vector dev_per_task; - const string* prior_task_name = &col_params->group.members[0].task; + const std::string* prior_task_name = &col_params->group.members[0].task; int dev_count = 1; for (int di = 1; di < col_params->group.group_size; ++di) { if (col_params->group.members[di].task != *prior_task_name) { @@ -102,8 +102,8 @@ absl::Status HierarchicalTreeBroadcaster::InitializeCollectiveParams( CHECK_EQ(col_params->group.num_tasks, dev_per_task.size()); if (VLOG_IS_ON(2)) { - string dpt_buf; - for (int dpt : dev_per_task) strings::StrAppend(&dpt_buf, dpt, ";"); + std::string dpt_buf; + for (int dpt : dev_per_task) absl::StrAppend(&dpt_buf, dpt, ";"); VLOG(2) << "HierarchicalTreeBroadcaster::InitializeCollectiveParams device=" << device_name << " source_rank=" << col_params->source_rank << " dev_per_task=" << dpt_buf; @@ -302,9 +302,9 @@ void HierarchicalTreeBroadcaster::RunTree() { if (-1 == my_rank) continue; int source_rank = col_params_->instance.impl_details.subdiv_source_rank[si]; if (VLOG_IS_ON(1)) { - string subdiv_buf; + std::string subdiv_buf; for (int r : col_params_->instance.impl_details.subdiv_permutations[si]) { - strings::StrAppend(&subdiv_buf, r, ","); + absl::StrAppend(&subdiv_buf, r, ","); } VLOG(1) << "Running Broadcast tree device=" << col_ctx_->device_name << " subdiv=" << si << " perm=" << subdiv_buf @@ -318,7 +318,7 @@ void HierarchicalTreeBroadcaster::RunTree() { if (my_rank >= 0 && my_rank != source_rank) { // Begin by receiving the value. tsl::profiler::TraceMe activity( - [&] { return strings::StrCat("ReceiveValue:", si); }, + [&] { return absl::StrCat("ReceiveValue:", si); }, tsl::profiler::TraceMeLevel::kInfo); int recv_from_rank = TreeRecvFrom(*col_params_, si); absl::Notification note; @@ -334,7 +334,7 @@ void HierarchicalTreeBroadcaster::RunTree() { // Then forward value to all descendent devices. 
{ tsl::profiler::TraceMe activity( - [&] { return strings::StrCat("ForwardValue:", si); }, + [&] { return absl::StrCat("ForwardValue:", si); }, tsl::profiler::TraceMeLevel::kInfo); if (my_rank >= 0 && status_.ok()) { std::vector send_to_ranks; @@ -413,7 +413,7 @@ void HierarchicalTreeBroadcaster::DispatchSend(int subdiv, int dst_rank, tsl::profiler::ScopedMemoryDebugAnnotation op_annotation( col_params_->name, col_ctx_->step_id, "dynamic", src_tensor->dtype(), [src_tensor]() { return src_tensor->shape().DebugString(); }); - string send_buf_key = + std::string send_buf_key = BroadcastBufKey(col_ctx_->exec_key, subdiv, src_rank, dst_rank); int dst_idx = col_params_->instance.impl_details.subdiv_permutations[subdiv][dst_rank]; @@ -434,7 +434,7 @@ void HierarchicalTreeBroadcaster::DispatchSend(int subdiv, int dst_rank, void HierarchicalTreeBroadcaster::DispatchRecv(int subdiv, int src_rank, int dst_rank, Tensor* dst_tensor, const StatusCallback& done) { - string recv_buf_key = + std::string recv_buf_key = BroadcastBufKey(col_ctx_->exec_key, subdiv, src_rank, dst_rank); int src_idx = col_params_->instance.impl_details.subdiv_permutations[subdiv][src_rank]; diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc index ba419077d2774e..408d8cb65b3682 100644 --- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc +++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc @@ -191,7 +191,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test { if (!instances_[di]->status_.ok()) { ASSERT_GT(fail_after, 0); ASSERT_NE(instances_[di]->status_.message().find("Deliberate failure"), - string::npos); + std::string::npos); ++failure_count_; continue; } @@ -221,7 +221,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test { // In the test we always broadcast from rank 0. 
col_params_->is_source = (rank == 0); col_params_->source_rank = 0; - string dev_name = col_params_->group.members[rank].device.name(); + std::string dev_name = col_params_->group.members[rank].device.name(); TF_CHECK_OK(test_env_->device_mgr->LookupDevice(dev_name, &device_)) << "Couldn't find device " << dev_name << " existing devices: " << test_env_->device_mgr->DebugString(); @@ -356,10 +356,10 @@ TEST_F(HierarchicalTreeBroadcasterInitParamsTest, cp->instance.impl_details.collective_name = "HierarchicalTreeBroadcast"; std::vector dev_per_task = {4, 4, 6, 8}; for (int ti = 0; ti < cp->group.num_tasks; ti++) { - string task_name = strings::StrCat("/job:worker/replica:0/task:", ti); + std::string task_name = absl::StrCat("/job:worker/replica:0/task:", ti); for (int di = 0; di < dev_per_task[ti]; di++) { CollGroupMember member; - member.device.set_name(strings::StrCat(task_name, "/device:GPU:", di)); + member.device.set_name(absl::StrCat(task_name, "/device:GPU:", di)); member.task = task_name; cp->group.members.push_back(member); cp->group.group_size++; diff --git a/tensorflow/core/common_runtime/immutable_executor_state.cc b/tensorflow/core/common_runtime/immutable_executor_state.cc index 6eef9e802d862e..64ded72c5e0d4e 100644 --- a/tensorflow/core/common_runtime/immutable_executor_state.cc +++ b/tensorflow/core/common_runtime/immutable_executor_state.cc @@ -68,7 +68,7 @@ void GetMaxPendingCounts(const Node* n, size_t* max_pending, } // namespace ImmutableExecutorState::FrameInfo* ImmutableExecutorState::EnsureFrameInfo( - const string& fname) { + const std::string& fname) { auto iter = frame_info_.find(fname); if (iter != frame_info_.end()) { return iter->second.get(); @@ -110,8 +110,8 @@ absl::Status ImmutableExecutorState::Initialize(const Graph& graph) { // TODO(mrry): Track whether control flow was present in the // pre-partitioned graph, and enable the caller (e.g. // `DirectSession`) to relax this constraint. - string send_device; - string recv_device; + std::string send_device; + std::string recv_device; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "send_device", &send_device)); TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "recv_device", &recv_device)); if (send_device != recv_device) { @@ -120,7 +120,7 @@ absl::Status ImmutableExecutorState::Initialize(const Graph& graph) { } const int id = n->id(); - const string& frame_name = cf_info.frame_names[id]; + const std::string& frame_name = cf_info.frame_names[id]; FrameInfo* frame_info = EnsureFrameInfo(frame_name); NodeItem* item = gview_.node(id); @@ -162,7 +162,7 @@ absl::Status ImmutableExecutorState::Initialize(const Graph& graph) { GetNodeAttr(n->attrs(), "is_constant", &is_constant_enter)); item->is_constant_enter = is_constant_enter; - string frame_name; + std::string frame_name; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &frame_name)); FrameInfo* frame_info = frame_info_[frame_name].get(); @@ -214,7 +214,7 @@ absl::Status ImmutableExecutorState::Initialize(const Graph& graph) { // Initialize static information about the frames in the graph. frame_info->nodes->push_back(item); if (item->is_enter) { - string enter_name; + std::string enter_name; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &enter_name)); EnsureFrameInfo(enter_name)->input_count++; } @@ -291,7 +291,7 @@ absl::Status ImmutableExecutorState::BuildControlFlowInfo( std::vector visited; visited.resize(num_nodes); - string frame_name; + std::string frame_name; std::deque ready; // Initialize with the root nodes. 
@@ -360,7 +360,7 @@ void ImmutableExecutorState::InitializePending(const Graph* graph, } if (!requires_control_flow_) { - atomic_pending_counts_.reset(new std::atomic<int32>[gview_.num_nodes()]); + atomic_pending_counts_.reset(new std::atomic<int32_t>[gview_.num_nodes()]); std::fill(atomic_pending_counts_.get(), atomic_pending_counts_.get() + gview_.num_nodes(), 0); } @@ -368,7 +368,7 @@ void ImmutableExecutorState::InitializePending(const Graph* graph, for (const Node* n : graph->nodes()) { if (IsSink(n)) continue; const int id = n->id(); - const string& name = cf_info.frame_names[id]; + const std::string& name = cf_info.frame_names[id]; size_t max_pending, max_dead; GetMaxPendingCounts(n, &max_pending, &max_dead); auto& counts = EnsureFrameInfo(name)->pending_counts; diff --git a/tensorflow/core/common_runtime/immutable_executor_state.h b/tensorflow/core/common_runtime/immutable_executor_state.h index 6a12bc1fb0b0c0..7e7437c5311d20 100644 --- a/tensorflow/core/common_runtime/immutable_executor_state.h +++ b/tensorflow/core/common_runtime/immutable_executor_state.h @@ -42,7 +42,7 @@ class Graph; class ImmutableExecutorState { public: struct FrameInfo { - explicit FrameInfo(string name) + explicit FrameInfo(std::string name) : name(std::move(name)), input_count(0), total_inputs(0), @@ -51,7 +51,7 @@ class ImmutableExecutorState { parallel_iterations(-1) {} // The name of the frame. - string name; + std::string name; // The total number of inputs to a frame. int input_count; @@ -71,7 +71,7 @@ class ImmutableExecutorState { std::unique_ptr> nodes; // The number of iterations of this frame that can execute concurrently. - int32 parallel_iterations; + int32_t parallel_iterations; }; explicit ImmutableExecutorState(const LocalExecutorParams& p) @@ -109,24 +109,24 @@ class ImmutableExecutorState { // // REQUIRES: `!requires_control_flow_support && len(dest) == // graph_view().num_nodes()`. - void copy_pending_counts(std::atomic<int32>* dest) const { + void copy_pending_counts(std::atomic<int32_t>* dest) const { DCHECK(!requires_control_flow_); memcpy(dest, atomic_pending_counts_.get(), - graph_view().num_nodes() * sizeof(std::atomic<int32>)); + graph_view().num_nodes() * sizeof(std::atomic<int32_t>)); std::atomic_thread_fence(std::memory_order_release); } private: struct ControlFlowInfo { - gtl::FlatSet<string> unique_frame_names; - std::vector<string> frame_names; + gtl::FlatSet<std::string> unique_frame_names; + std::vector<std::string> frame_names; }; static absl::Status BuildControlFlowInfo(const Graph* graph, ControlFlowInfo* cf_info); void InitializePending(const Graph* graph, const ControlFlowInfo& cf_info); - FrameInfo* EnsureFrameInfo(const string& fname); + FrameInfo* EnsureFrameInfo(const std::string& fname); // Owned. LocalExecutorParams params_; @@ -150,7 +150,7 @@ class ImmutableExecutorState { // If `requires_control_flow_` is false, this points to an array of initial // pending counts for the nodes in the graph, indexed by node ID. - std::unique_ptr<std::atomic<int32>[]> atomic_pending_counts_; + std::unique_ptr<std::atomic<int32_t>[]> atomic_pending_counts_; // Shallow copies of the constant tensors used in the graph. std::vector<Tensor> const_tensors_; diff --git a/tensorflow/core/common_runtime/inline_function_utils.cc b/tensorflow/core/common_runtime/inline_function_utils.cc index 1e8a85207fa0b1..a627e9e8aff9c9 100644 --- a/tensorflow/core/common_runtime/inline_function_utils.cc +++ b/tensorflow/core/common_runtime/inline_function_utils.cc @@ -70,11 +70,11 @@ struct Endpoint { int index; // Returns the string name represents this endpoint.
- string name() const { + std::string name() const { if (index == 0) { return node->name(); } else { - return strings::StrCat(node->name(), ":", index); + return absl::StrCat(node->name(), ":", index); } } @@ -82,7 +82,7 @@ struct Endpoint { }; struct EndpointHash { - uint64 operator()(const Endpoint& x) const { + uint64_t operator()(const Endpoint& x) const { return Hash64(reinterpret_cast<const char*>(&x.node), sizeof(Node*), x.index); } @@ -120,15 +120,15 @@ static Node* AddIdentity(absl::string_view name, Graph* g, Endpoint input) { return ret; } -std::vector<string> InputDevices(const Node& caller) { - std::vector<string> input_devices(caller.in_edges().size()); - std::vector<string> input_tensors(caller.in_edges().size()); +std::vector<std::string> InputDevices(const Node& caller) { + std::vector<std::string> input_devices(caller.in_edges().size()); + std::vector<std::string> input_tensors(caller.in_edges().size()); for (const Edge* edge : caller.in_edges()) { if (edge->IsControlEdge()) continue; - const string& input_device = edge->src()->has_assigned_device_name() - ? edge->src()->assigned_device_name() - : edge->src()->requested_device(); + const std::string& input_device = edge->src()->has_assigned_device_name() + ? edge->src()->assigned_device_name() + : edge->src()->requested_device(); input_devices[edge->dst_input()] = input_device; input_tensors[edge->dst_input()] = absl::StrCat(edge->src()->name(), ":", edge->src_output()); @@ -154,22 +154,24 @@ class DefaultFunctionBodyPlacer : public InlinedFunctionBodyPlacer { explicit DefaultFunctionBodyPlacer(const Node& caller) : input_devices_(InputDevices(caller)) {} - absl::optional<string> InputNodeDevice(int input_index) const override { + absl::optional<std::string> InputNodeDevice(int input_index) const override { return input_devices_[input_index]; } - absl::optional<string> OutputNodeDevice(int output_index) const override { + absl::optional<std::string> OutputNodeDevice( + int output_index) const override { return absl::nullopt; } bool ColocateInputOutputIdentities() const override { return false; } - absl::optional<string> ControlNodeDevice() const override { + absl::optional<std::string> ControlNodeDevice() const override { return absl::nullopt; } - absl::optional<string> BodyNodeDevice(const NodeDef& ndef) const override { + absl::optional<std::string> BodyNodeDevice( + const NodeDef& ndef) const override { return absl::nullopt; } private: - const std::vector<string> input_devices_; + const std::vector<std::string> input_devices_; }; // Place all nodes on the same device as caller node.
@@ -178,22 +180,24 @@ class SingleDeviceFunctionBodyPlacer : public InlinedFunctionBodyPlacer { explicit SingleDeviceFunctionBodyPlacer(const Node& caller) : caller_device_(caller.def().device()) {} - absl::optional InputNodeDevice(int input_index) const override { + absl::optional InputNodeDevice(int input_index) const override { return caller_device_; } - absl::optional OutputNodeDevice(int output_index) const override { + absl::optional OutputNodeDevice( + int output_index) const override { return caller_device_; } bool ColocateInputOutputIdentities() const override { return false; } - absl::optional ControlNodeDevice() const override { + absl::optional ControlNodeDevice() const override { return caller_device_; } - absl::optional BodyNodeDevice(const NodeDef& ndef) const override { + absl::optional BodyNodeDevice( + const NodeDef& ndef) const override { return caller_device_; } private: - const string caller_device_; + const std::string caller_device_; }; // Place input nodes on the same device as the corresponding caller input @@ -209,17 +213,19 @@ class MultiDeviceFunctionBodyPlacer : public InlinedFunctionBodyPlacer { DeviceNameUtils::ParseFullName(caller_device_, &caller_parsed_device_); } - absl::optional InputNodeDevice(int input_index) const override { + absl::optional InputNodeDevice(int input_index) const override { return input_devices_[input_index]; } - absl::optional OutputNodeDevice(int output_index) const override { + absl::optional OutputNodeDevice( + int output_index) const override { return absl::nullopt; } bool ColocateInputOutputIdentities() const override { return true; } - absl::optional ControlNodeDevice() const override { + absl::optional ControlNodeDevice() const override { return caller_device_; } - absl::optional BodyNodeDevice(const NodeDef& ndef) const override { + absl::optional BodyNodeDevice( + const NodeDef& ndef) const override { // LINT.IfChange // TODO(ezhulenev): If function would have been instantiated as a // multi-device function and executed via FunctionLibraryRuntime, it could @@ -240,10 +246,10 @@ class MultiDeviceFunctionBodyPlacer : public InlinedFunctionBodyPlacer { } private: - string caller_device_; + std::string caller_device_; bool has_parsed_caller_device_; DeviceNameUtils::ParsedName caller_parsed_device_; - std::vector input_devices_; + std::vector input_devices_; }; } // namespace @@ -286,7 +292,7 @@ using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource; // Propagate the debug info of `nodes` in function `func` to the `target` node. // If the debug info of any node is missing, its node name and function name // is used. -void PropagateDebugInfoToNode(const string& func, +void PropagateDebugInfoToNode(const std::string& func, const std::vector& nodes, NodeDef* target) { if (nodes.empty() || target->has_experimental_debug_info()) { @@ -306,10 +312,10 @@ void PropagateDebugInfoToNode(const string& func, } } // namespace -string InlineFunctionBodyOptions::DebugString() const { +std::string InlineFunctionBodyOptions::DebugString() const { const auto true_false = [](bool b) { return b ? "true" : "false"; }; - const auto keep_caller_node_str = [this]() -> string { + const auto keep_caller_node_str = [this]() -> std::string { switch (keep_caller_node) { case KeepCallerNode::kDoNotKeep: return "DoNotKeep"; @@ -508,7 +514,7 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, // Add a NoOp node for function control inputs/outputs. 
const auto no_op = [&](absl::string_view name) -> Node* { Node* node = AddNoOp(absl::StrCat(caller->name(), "/", name), g); - const absl::optional device = placer->ControlNodeDevice(); + const absl::optional device = placer->ControlNodeDevice(); if (device.has_value()) node->set_requested_device(*device); return node; }; @@ -517,13 +523,13 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, const auto input_identity = [&](absl::string_view name, Endpoint input, int index) -> Node* { Node* node = AddIdentity(absl::StrCat(caller->name(), "/", name), g, input); - const absl::optional device = placer->InputNodeDevice(index); + const absl::optional device = placer->InputNodeDevice(index); if (device.has_value()) node->set_requested_device(*device); bool colocate_identity = placer->ColocateInputOutputIdentities(); if (colocate_identity) { node->AddAttr(kColocationAttrName, - std::vector{absl::StrCat(kColocationGroupPrefix, - input.node->name())}); + std::vector{absl::StrCat( + kColocationGroupPrefix, input.node->name())}); } return node; }; @@ -532,13 +538,13 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, const auto output_identity = [&](absl::string_view name, Endpoint input, int index) -> Node* { Node* node = AddIdentity(absl::StrCat(caller->name(), "/", name), g, input); - const absl::optional device = placer->OutputNodeDevice(index); + const absl::optional device = placer->OutputNodeDevice(index); if (device.has_value()) node->set_requested_device(*device); bool colocate_identity = placer->ColocateInputOutputIdentities(); if (colocate_identity) { node->AddAttr(kColocationAttrName, - std::vector{absl::StrCat(kColocationGroupPrefix, - input.node->name())}); + std::vector{absl::StrCat( + kColocationGroupPrefix, input.node->name())}); } return node; }; @@ -597,7 +603,7 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, // // If 'x' is a node in fbody->graph and its copy in 'g' is 'y', we // remember 'y' in node_map[x->id()]. - std::unordered_set fn_nodes; + std::unordered_set fn_nodes; for (Node* n : fbody->graph->op_nodes()) { fn_nodes.insert(n->name()); } @@ -606,7 +612,7 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, NodeDef ndef = n->def(); // Maybe override requested node device assignment. - const absl::optional device = placer->BodyNodeDevice(ndef); + const absl::optional device = placer->BodyNodeDevice(ndef); if (device.has_value()) ndef.set_device(*device); // Add inlined function name to inlined node debug information. 
@@ -617,7 +623,7 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, // 1) to node name to avoid collisions // 2) to frame name to avoid multiple LoopCond nodes in one frame // 3) to colocation attribute - const string prefix = strings::StrCat(caller->name(), "/"); + const std::string prefix = absl::StrCat(caller->name(), "/"); TF_RETURN_IF_ERROR(AddPrefixAndSuffixToNode(prefix, /*suffix=*/"", &ndef, options.uniquify_frame_names)); diff --git a/tensorflow/core/common_runtime/inline_function_utils.h b/tensorflow/core/common_runtime/inline_function_utils.h index 94c118fe882a20..7ffafe13e5df03 100644 --- a/tensorflow/core/common_runtime/inline_function_utils.h +++ b/tensorflow/core/common_runtime/inline_function_utils.h @@ -41,13 +41,16 @@ class InlinedFunctionBodyPlacer { public: virtual ~InlinedFunctionBodyPlacer() = default; - virtual absl::optional InputNodeDevice(int input_index) const = 0; - virtual absl::optional OutputNodeDevice(int output_index) const = 0; + virtual absl::optional InputNodeDevice( + int input_index) const = 0; + virtual absl::optional OutputNodeDevice( + int output_index) const = 0; // Returns true if the added input/output identity nodes should be colocated // with the corresponding input/output from the function body. virtual bool ColocateInputOutputIdentities() const = 0; - virtual absl::optional ControlNodeDevice() const = 0; - virtual absl::optional BodyNodeDevice(const NodeDef& ndef) const = 0; + virtual absl::optional ControlNodeDevice() const = 0; + virtual absl::optional BodyNodeDevice( + const NodeDef& ndef) const = 0; // LINT.IfChange // Place input nodes on the same device as the corresponding caller input @@ -72,7 +75,7 @@ class InlinedFunctionBodyPlacer { const Graph&, const Node&)>; struct Config { - string name; + std::string name; Factory get; }; @@ -147,7 +150,7 @@ struct InlineFunctionBodyOptions { bool uniquify_frame_names = true; // A human-readable debug string for this options. - string DebugString() const; + std::string DebugString() const; }; // Returns 'OkStatus()' iff the function '*fbody' can be inlined at 'node' diff --git a/tensorflow/core/common_runtime/inline_function_utils_test.cc b/tensorflow/core/common_runtime/inline_function_utils_test.cc index 0d726ade656f21..1e20e6da535a16 100644 --- a/tensorflow/core/common_runtime/inline_function_utils_test.cc +++ b/tensorflow/core/common_runtime/inline_function_utils_test.cc @@ -50,7 +50,7 @@ TEST(InlineFunctionBody, ColocationConstraintPropagation) { {{"z"}, "AddV2", {"x", "y"}, - {{"T", DT_FLOAT}, {"_class", std::vector({"loc:@x"})}}}, + {{"T", DT_FLOAT}, {"_class", std::vector({"loc:@x"})}}}, }); TF_ASSERT_OK(flib_def.AddFunctionDef(fdef)); @@ -98,7 +98,8 @@ TEST(InlineFunctionBody, ColocationConstraintPropagation) { // Func/call/input/_0. 
NDef("call/z", "AddV2", {"Func/call/input/_0", "Func/call/input/_1"}, {{"T", DT_FLOAT}, - {"_class", std::vector({"loc:@Func/call/input/_0"})}}), + {"_class", + std::vector({"loc:@Func/call/input/_0"})}}), NDef("Func/call/output/_2", "Identity", {"call/z"}, {{"T", DT_FLOAT}}), }, diff --git a/tensorflow/core/common_runtime/input_colocation_exemption_registry.cc b/tensorflow/core/common_runtime/input_colocation_exemption_registry.cc index 7b0fa4af464fe9..4edf42ff812b8d 100644 --- a/tensorflow/core/common_runtime/input_colocation_exemption_registry.cc +++ b/tensorflow/core/common_runtime/input_colocation_exemption_registry.cc @@ -27,7 +27,7 @@ InputColocationExemptionRegistry* InputColocationExemptionRegistry::Global() { return registry; } -void InputColocationExemptionRegistry::Register(const string& op) { +void InputColocationExemptionRegistry::Register(const std::string& op) { auto it = ops_.find(op); if (it != ops_.end()) { LOG(WARNING) << "Input colocation exemption for op: " << op diff --git a/tensorflow/core/common_runtime/input_colocation_exemption_registry.h b/tensorflow/core/common_runtime/input_colocation_exemption_registry.h index c393fe7498b696..9e4bbc9e77f4af 100644 --- a/tensorflow/core/common_runtime/input_colocation_exemption_registry.h +++ b/tensorflow/core/common_runtime/input_colocation_exemption_registry.h @@ -40,20 +40,20 @@ class InputColocationExemptionRegistry { static InputColocationExemptionRegistry* Global(); // Returns the set of ops exempt from the input colocation constraints. - const gtl::FlatSet& Get() { return ops_; } + const gtl::FlatSet& Get() { return ops_; } // Registers an op to be excluded from the input colocation constraints. - void Register(const string& op); + void Register(const std::string& op); private: - gtl::FlatSet ops_; + gtl::FlatSet ops_; }; namespace input_colocation_exemption_registration { class InputColocationExemptionRegistration { public: - explicit InputColocationExemptionRegistration(const string& op) { + explicit InputColocationExemptionRegistration(const std::string& op) { InputColocationExemptionRegistry::Global()->Register(op); } }; diff --git a/tensorflow/core/common_runtime/inspecting_placer.cc b/tensorflow/core/common_runtime/inspecting_placer.cc index 96799bcf1e4be8..816d3dcae487a9 100644 --- a/tensorflow/core/common_runtime/inspecting_placer.cc +++ b/tensorflow/core/common_runtime/inspecting_placer.cc @@ -34,21 +34,21 @@ limitations under the License. 
namespace tensorflow { -string IOColocationGroups::DebugString() const { - std::unordered_map<int, std::vector<string>> group_members; +std::string IOColocationGroups::DebugString() const { + std::unordered_map<int, std::vector<std::string>> group_members; for (int arg_index = 0; arg_index < input_groups.size(); ++arg_index) { int group_id = input_groups[arg_index]; - group_members[group_id].push_back(strings::StrCat("i:", arg_index)); + group_members[group_id].push_back(absl::StrCat("i:", arg_index)); } for (int ret_index = 0; ret_index < output_groups.size(); ++ret_index) { int group_id = output_groups[ret_index]; - group_members[group_id].push_back(strings::StrCat("o:", ret_index)); + group_members[group_id].push_back(absl::StrCat("o:", ret_index)); } - std::vector<string> group_strings; + std::vector<std::string> group_strings; for (const auto& it : group_members) { int group_id = it.first; - const std::vector<string>& members = it.second; + const std::vector<std::string>& members = it.second; const PossibleDevices& devices = group_devices[group_id]; group_strings.push_back(strings::StrCat( "Group(", group_id, " members = [", absl::StrJoin(members, ", "), @@ -57,11 +57,11 @@ string IOColocationGroups::DebugString() const { "\" resource_device_name = \"", DeviceNameUtils::ParsedNameToString(devices.resource_device_name), "\" device_types = [", - absl::StrJoin( - devices.device_types, ", ", - [](string* out, const std::pair<DeviceType, int32>& type_and_pref) { - out->append(DeviceTypeString(type_and_pref.first)); - }), + absl::StrJoin(devices.device_types, ", ", + [](std::string* out, + const std::pair<DeviceType, int32_t>& type_and_pref) { + out->append(DeviceTypeString(type_and_pref.first)); + }), "])")); } diff --git a/tensorflow/core/common_runtime/inspecting_placer.h b/tensorflow/core/common_runtime/inspecting_placer.h index 90df36c58139fd..27e45dacadad8b 100644 --- a/tensorflow/core/common_runtime/inspecting_placer.h +++ b/tensorflow/core/common_runtime/inspecting_placer.h @@ -59,7 +59,7 @@ struct IOColocationGroups { // group_devices[i] contains possible devices for group with id i. std::vector<PossibleDevices> group_devices; - string DebugString() const; + std::string DebugString() const; }; class InspectingPlacer { diff --git a/tensorflow/core/common_runtime/int32_fulltype.h b/tensorflow/core/common_runtime/int32_fulltype.h index 1a55e0bc6a1e7c..8e89b0bec2f6d9 100644 --- a/tensorflow/core/common_runtime/int32_fulltype.h +++ b/tensorflow/core/common_runtime/int32_fulltype.h @@ -29,7 +29,7 @@ namespace tensorflow { class Int32FulltypePass { public: Int32FulltypePass() = default; - explicit Int32FulltypePass(string debug_location) + explicit Int32FulltypePass(std::string debug_location) : debug_location_(debug_location) {} // For each node in this graph that outputs int32 tensors, set full @@ -57,7 +57,7 @@ class Int32FulltypePass { private: // Location of where annotations were added for debug messages. - string debug_location_; + std::string debug_location_; }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/int32_fulltype_test.cc b/tensorflow/core/common_runtime/int32_fulltype_test.cc index 8cfb991cdacd38..ed8587667e9bcc 100644 --- a/tensorflow/core/common_runtime/int32_fulltype_test.cc +++ b/tensorflow/core/common_runtime/int32_fulltype_test.cc @@ -96,14 +96,14 @@ class Int32FulltypeTest : public ::testing::Test { // Returns the node in "graph" with the given name. // // REQUIRES: "graph" was produced by the most recent call to BuildGraph.
- Node* GetNodeByName(const Graph& graph, const string& name) { + Node* GetNodeByName(const Graph& graph, const std::string& name) { const auto search = nodes_by_name_.find(name); CHECK(search != nodes_by_name_.end()) << "Unknown node name: " << name; return graph.FindNodeId(search->second); } protected: - std::unordered_map nodes_by_name_; + std::unordered_map nodes_by_name_; private: void RebuildNodeNameMap(const Graph& graph) { diff --git a/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc index 5afdc072fcc1ae..be10cd744f35f1 100644 --- a/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc +++ b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc @@ -67,11 +67,11 @@ void RunPassAndCompare(const GraphDef& original, GraphDef rewritten; RunPass(original, &rewritten); - std::vector errors; + std::vector errors; errors.push_back(absl::StrCat("Graphs did not match.\n Rewritten graph:\n", SummarizeGraphDef(rewritten))); for (const GraphDef& alternative : expected_alternatives) { - string diff; + std::string diff; bool graphs_equal = EqualGraphDef(rewritten, alternative, &diff); if (graphs_equal) { return; diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc index 1e17e24df37677..78f2d219505341 100644 --- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc +++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc @@ -45,7 +45,7 @@ namespace tensorflow { namespace test { // TODO(hongm): Convert `g` and `init` to using std::unique_ptr. -Benchmark::Benchmark(const string& device, Graph* g, +Benchmark::Benchmark(const std::string& device, Graph* g, const SessionOptions* options, Graph* init, Rendezvous* rendez, const char* executor_type, bool old_benchmark_api) { @@ -61,7 +61,7 @@ Benchmark::Benchmark(const string& device, Graph* g, CHECK(!old_benchmark_api) << "Expected new API only"; - string t = absl::AsciiStrToUpper(device); + std::string t = absl::AsciiStrToUpper(device); // Allow NewDevice to allocate a new threadpool with different number of // threads for each new benchmark. 
LocalDevice::set_use_global_threadpool(false); @@ -121,7 +121,8 @@ Benchmark::Benchmark(const string& device, Graph* g, TF_CHECK_OK(NewExecutor(executor_type, params, *g, &exec_)); } -Benchmark::Benchmark(const string& device, Graph* g, bool old_benchmark_api) +Benchmark::Benchmark(const std::string& device, Graph* g, + bool old_benchmark_api) : Benchmark(device, g, nullptr, nullptr, nullptr, "", old_benchmark_api) {} Benchmark::~Benchmark() { @@ -141,14 +142,14 @@ void Benchmark::Run(benchmark::State& state) { RunWithRendezvousArgs({}, {}, state); } -string GetRendezvousKey(const Node* node) { - string send_device; +std::string GetRendezvousKey(const Node* node) { + std::string send_device; TF_CHECK_OK(GetNodeAttr(node->attrs(), "send_device", &send_device)); - string recv_device; + std::string recv_device; TF_CHECK_OK(GetNodeAttr(node->attrs(), "recv_device", &recv_device)); - string tensor_name; + std::string tensor_name; TF_CHECK_OK(GetNodeAttr(node->attrs(), "tensor_name", &tensor_name)); - uint64 send_device_incarnation; + uint64_t send_device_incarnation; TF_CHECK_OK( GetNodeAttr(node->attrs(), "send_device_incarnation", reinterpret_cast(&send_device_incarnation))); @@ -157,8 +158,8 @@ string GetRendezvousKey(const Node* node) { } void Benchmark::RunWithRendezvousArgs( - const std::vector>& inputs, - const std::vector& outputs, benchmark::State& state) { + const std::vector>& inputs, + const std::vector& outputs, benchmark::State& state) { if (!device_ || state.max_iterations == 0) { return; } @@ -179,7 +180,7 @@ void Benchmark::RunWithRendezvousArgs( TF_CHECK_OK(rendez_->Send(parsed, Rendezvous::Args(), p.second, false)); } TF_CHECK_OK(exec_->Run(args)); - for (const string& key : outputs) { + for (const std::string& key : outputs) { Rendezvous::ParsedKey parsed; TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed)); TF_CHECK_OK(rendez_->Recv(parsed, Rendezvous::Args(), &unused, &is_dead)); @@ -197,7 +198,7 @@ void Benchmark::RunWithRendezvousArgs( TF_CHECK_OK(rendez_->Send(parsed, Rendezvous::Args(), p.second, false)); } TF_CHECK_OK(exec_->Run(args)); - for (const string& key : outputs) { + for (const std::string& key : outputs) { Rendezvous::ParsedKey parsed; TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed)); TF_CHECK_OK(rendez_->Recv(parsed, Rendezvous::Args(), &unused, &is_dead)); diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h index fcab9a65bc586a..a0e5486b96c120 100644 --- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h +++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h @@ -48,20 +48,20 @@ class Benchmark { // * In the new API, the timer starts automatically at the first // iteration of the loop and stops after the last iteration. // TODO(vyng) Remove this once we have migrated all code to newer API. 
- Benchmark(const string& device, Graph* g, + Benchmark(const std::string& device, Graph* g, const SessionOptions* options = nullptr, Graph* init = nullptr, Rendezvous* rendez = nullptr, const char* executor_type = "", bool old_benchmark_api = false); - Benchmark(const string& device, Graph* g, bool old_benchmark_api); + Benchmark(const std::string& device, Graph* g, bool old_benchmark_api); ~Benchmark(); void Run(benchmark::State& state); void RunWithRendezvousArgs( - const std::vector>& inputs, - const std::vector& outputs, benchmark::State& state); + const std::vector>& inputs, + const std::vector& outputs, benchmark::State& state); private: thread::ThreadPool* pool_ = nullptr; // Not owned. @@ -78,7 +78,7 @@ class Benchmark { }; // Returns the rendezvous key associated with the given Send/Recv node. -string GetRendezvousKey(const Node* node); +std::string GetRendezvousKey(const Node* node); } // end namespace test } // end namespace tensorflow diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc index 63fd2f1b59c223..9997ff2a30c008 100644 --- a/tensorflow/core/common_runtime/local_device.cc +++ b/tensorflow/core/common_runtime/local_device.cc @@ -85,7 +85,7 @@ struct LocalDevice::EigenThreadPoolInfo { thread_opts.numa_node = numa_node; eigen_worker_threads_.num_threads = intra_op_parallelism_threads; eigen_worker_threads_.workers = new thread::ThreadPool( - options.env, thread_opts, strings::StrCat("numa_", numa_node, "_Eigen"), + options.env, thread_opts, absl::StrCat("numa_", numa_node, "_Eigen"), intra_op_parallelism_threads, !options.config.experimental().disable_thread_spinning(), /*allocator=*/nullptr); diff --git a/tensorflow/core/common_runtime/lower_case_op.cc b/tensorflow/core/common_runtime/lower_case_op.cc index 39d1d150fa8a1b..88c169bc4a80d3 100644 --- a/tensorflow/core/common_runtime/lower_case_op.cc +++ b/tensorflow/core/common_runtime/lower_case_op.cc @@ -38,7 +38,7 @@ class CaseBuilder { public: // Create a CaseBuilder to create the lowered form of `case` with branch // functions identified by `branch_fn_names` in the `graph`. - CaseBuilder(Node* case_op, const std::vector& branch_fn_names, + CaseBuilder(Node* case_op, const std::vector& branch_fn_names, bool keep_node_fetchable, Graph* graph); // Constructs the basic conditional control flow using switch and merge nodes. @@ -58,7 +58,7 @@ class CaseBuilder { private: // Returns unique name containing the name of the Case op being rewritten // (name_), infix and a suffix to ensure it is unique within the graph. - string NewName(const string& infix); + std::string NewName(const std::string& infix); // Adds input to both the then and else nodes from src:src_output. absl::Status AddInput(Node* src, int src_output); @@ -88,7 +88,7 @@ class CaseBuilder { // for the side effects. 
Node* branch_executed_node_; Graph* graph_; - string name_; + std::string name_; bool keep_node_fetchable_; NodeDebugInfo debug_info_; @@ -96,7 +96,7 @@ class CaseBuilder { }; CaseBuilder::CaseBuilder(Node* case_op, - const std::vector& branch_fn_names, + const std::vector& branch_fn_names, bool keep_node_fetchable, Graph* graph) : case_op_(case_op), num_branches_(branch_fn_names.size()), @@ -106,7 +106,7 @@ CaseBuilder::CaseBuilder(Node* case_op, debug_info_(*case_op_) { branch_call_builders_.reserve(num_branches_); for (int b = 0; b < num_branches_; b++) { - branch_call_builders_.emplace_back(NewName(strings::StrCat("branch", b)), + branch_call_builders_.emplace_back(NewName(absl::StrCat("branch", b)), branch_fn_names[b], graph->op_registry(), &debug_info_); branch_call_builders_[b].Device(case_op_->requested_device()); @@ -129,7 +129,7 @@ absl::Status CaseBuilder::CreatePivotNodes() { control_predecessor_ = branch_index; pivots_.resize(num_branches_, nullptr); for (int b = 0; b < num_branches_; b++) { - TF_RETURN_IF_ERROR(NodeBuilder(NewName(strings::StrCat("pivot_", b)), + TF_RETURN_IF_ERROR(NodeBuilder(NewName(absl::StrCat("pivot_", b)), "Identity", graph_->op_registry(), &debug_info_) .Input(branch_index, b) @@ -139,8 +139,8 @@ absl::Status CaseBuilder::CreatePivotNodes() { return absl::OkStatus(); } -string CaseBuilder::NewName(const string& infix) { - return graph_->NewName(strings::StrCat(name_, "/", infix)); +std::string CaseBuilder::NewName(const std::string& infix) { + return graph_->NewName(absl::StrCat(name_, "/", infix)); } absl::Status CaseBuilder::AddInput(Node* src, int src_output) { @@ -276,7 +276,7 @@ absl::Status RewriteCaseNode(Node* n, Graph* g, bool keep_node_fetchable) { } int num_branches = branches_attr->list().func_size(); - std::vector branch_fn_names; + std::vector branch_fn_names; branch_fn_names.reserve(num_branches); for (int b = 0; b < num_branches; b++) { branch_fn_names.emplace_back(branches_attr->list().func(b).name()); diff --git a/tensorflow/core/common_runtime/lower_case_op_test.cc b/tensorflow/core/common_runtime/lower_case_op_test.cc index eb5033cd75b000..d460d761fc646d 100644 --- a/tensorflow/core/common_runtime/lower_case_op_test.cc +++ b/tensorflow/core/common_runtime/lower_case_op_test.cc @@ -184,8 +184,8 @@ TEST(LowerCaseOpTest, BranchFunctionsWithoutOutputs) { using FDH = ::tensorflow::FunctionDefHelper; // Wrap AssignAddVariable + Const into a function. - const auto assign_add = [](const string& fn_name, int v) { - const Tensor tensor = test::AsScalar(v); + const auto assign_add = [](const std::string& fn_name, int v) { + const Tensor tensor = test::AsScalar(v); return FDH::Create( fn_name, {"v: resource"}, {}, {}, { diff --git a/tensorflow/core/common_runtime/lower_function_call_op_test.cc b/tensorflow/core/common_runtime/lower_function_call_op_test.cc index d276c7c43abbb7..3a2de9036df433 100644 --- a/tensorflow/core/common_runtime/lower_function_call_op_test.cc +++ b/tensorflow/core/common_runtime/lower_function_call_op_test.cc @@ -36,13 +36,13 @@ limitations under the License. 
namespace tensorflow { namespace { -AttrValue FuncAttr(const string& name) { +AttrValue FuncAttr(const std::string& name) { AttrValue attr; attr.mutable_func()->set_name(name); return attr; } -AttrValue FuncAttr(const string& name, const DataType type) { +AttrValue FuncAttr(const std::string& name, const DataType type) { AttrValue attr; attr.mutable_func()->set_name(name); (*attr.mutable_func()->mutable_attr())["T"].set_type(type); diff --git a/tensorflow/core/common_runtime/lower_functional_ops.cc b/tensorflow/core/common_runtime/lower_functional_ops.cc index 49885ba8129e8e..a2c2b6986a5e8b 100644 --- a/tensorflow/core/common_runtime/lower_functional_ops.cc +++ b/tensorflow/core/common_runtime/lower_functional_ops.cc @@ -52,7 +52,7 @@ bool CheckBoolAttr(const Node* n, absl::string_view attr_name) { // Checks if string attribute is defined and it's not empty. bool CheckStringAttr(const Node* n, absl::string_view attr_name) { - string match; + std::string match; bool found = TryGetNodeAttr(n->attrs(), attr_name, &match); return found && !match.empty(); } diff --git a/tensorflow/core/common_runtime/lower_functional_ops_test.cc b/tensorflow/core/common_runtime/lower_functional_ops_test.cc index 2f16c6fef7e308..2d47ac5d70bd3c 100644 --- a/tensorflow/core/common_runtime/lower_functional_ops_test.cc +++ b/tensorflow/core/common_runtime/lower_functional_ops_test.cc @@ -66,7 +66,7 @@ absl::Status Rewrite(std::unique_ptr* graph) { // (counter:int32, pred:bool, x:int32) -> counter < N FunctionDef WhileWithIfCond(int32_t N) { - const Tensor kN = test::AsScalar(N); + const Tensor kN = test::AsScalar(N); return FDH::Define( // Name "WhileWithIfCond", @@ -90,7 +90,7 @@ FunctionDef WhileWithIfBody() { then_func.set_name("XTimesTwo"); NameAttrList else_func; else_func.set_name("XTimesFour"); - const Tensor kOne = test::AsScalar(1); + const Tensor kOne = test::AsScalar(1); std::vector input_types = {DT_INT32}; std::vector output_types = {DT_INT32}; return FDH::Define( diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc index e46ef4ff3de543..01beef8fc2328d 100644 --- a/tensorflow/core/common_runtime/lower_if_op.cc +++ b/tensorflow/core/common_runtime/lower_if_op.cc @@ -60,7 +60,7 @@ class CondBuilder { private: // Returns unique name containing the name of the If op being rewritten // (name_), infix and a suffix to ensure it is unique within the graph. - string NewName(const string& infix); + std::string NewName(const std::string& infix); // Adds input to both the then and else nodes from src:src_output. absl::Status AddInput(Node* src, int src_output); @@ -102,7 +102,7 @@ class CondBuilder { // executed for the side effects. 
Node* branch_executed_node_; Graph* graph_; - string name_; + std::string name_; bool keep_node_fetchable_; NodeDebugInfo debug_info_; @@ -172,8 +172,8 @@ absl::Status CondBuilder::CreatePivotNodes() { return absl::OkStatus(); } -string CondBuilder::NewName(const string& infix) { - return graph_->NewName(strings::StrCat(name_, "/", infix)); +std::string CondBuilder::NewName(const std::string& infix) { + return graph_->NewName(absl::StrCat(name_, "/", infix)); } absl::Status CondBuilder::AddInput(Node* src, int src_output) { diff --git a/tensorflow/core/common_runtime/lower_if_op_test.cc b/tensorflow/core/common_runtime/lower_if_op_test.cc index 91bddb27b452be..68c55d27d16433 100644 --- a/tensorflow/core/common_runtime/lower_if_op_test.cc +++ b/tensorflow/core/common_runtime/lower_if_op_test.cc @@ -35,7 +35,7 @@ limitations under the License. namespace tensorflow { namespace { -AttrValue FuncAttr(const string& name) { +AttrValue FuncAttr(const std::string& name) { AttrValue attr; attr.mutable_func()->set_name(name); return attr; @@ -153,8 +153,8 @@ TEST(LowerIfOpTest, BranchFunctionsWithoutOutputs) { using FDH = ::tensorflow::FunctionDefHelper; // Wrap AssignAddVariable + Const into a function. - const auto assign_add = [](const string& fn_name, int v) { - const Tensor tensor = test::AsScalar(v); + const auto assign_add = [](const std::string& fn_name, int v) { + const Tensor tensor = test::AsScalar(v); return FDH::Create( fn_name, {"v: resource"}, {}, {}, { diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc index 8a8c3c075dd235..84f03444a93972 100644 --- a/tensorflow/core/common_runtime/lower_while_op.cc +++ b/tensorflow/core/common_runtime/lower_while_op.cc @@ -132,7 +132,7 @@ class LowerWhileHelper { // Returns unique name containing the name of the While op being rewritten // (name_), infix and a suffix to ensure it is unique within the graph. - string NewName(const string& infix); + std::string NewName(const std::string& infix); // Returns true if the input at index is a resource and the same resource is // returned as an output. @@ -156,7 +156,7 @@ class LowerWhileHelper { Graph* graph_; const FunctionLibraryDefinition* flib_def_; // Name of the `while_op_`. - string name_; + std::string name_; // Max number of parallel_iterations for the while loop. const int parallel_iterations_; bool keep_node_fetchable_; @@ -363,15 +363,15 @@ absl::Status LowerWhileHelper::CreateSwitchNodes() { if (IsLoopCarriedResource(i)) { continue; } - string op_name; + std::string op_name; { const Node* input_node; TF_RETURN_IF_ERROR(while_op_->input_node(i, &input_node)); - op_name = strings::StrCat(input_node->name(), "_switch"); + op_name = absl::StrCat(input_node->name(), "_switch"); } Node* merge_node = merge_nodes_[op_input_output_to_lowered_node_[i]]; Node* switch_node; - string op_type = "Switch"; + std::string op_type = "Switch"; if (IsRefType(merge_node->output_type(0))) { op_type = "RefSwitch"; } @@ -413,7 +413,7 @@ absl::Status LowerWhileHelper::CreateBodyFuncCallNode() { // node is not the first one to be ready? Can we speed that case up using some // sort of multi-input Merge? 
Node* body_control_node_; - string op_type = "Identity"; + std::string op_type = "Identity"; if (IsRefType(switch_nodes_[0]->output_type(1))) { op_type = "RefIdentity"; } @@ -569,8 +569,8 @@ absl::Status LowerWhileHelper::UpdateConsumers() { return absl::OkStatus(); } -string LowerWhileHelper::NewName(const string& infix) { - return graph_->NewName(strings::StrCat(name_, "/", infix)); +std::string LowerWhileHelper::NewName(const std::string& infix) { + return graph_->NewName(absl::StrCat(name_, "/", infix)); } bool LowerWhileHelper::IsLoopCarriedResource(int index) { diff --git a/tensorflow/core/common_runtime/lower_while_op_test.cc b/tensorflow/core/common_runtime/lower_while_op_test.cc index 4fe9337c942766..eb19c84c04dd44 100644 --- a/tensorflow/core/common_runtime/lower_while_op_test.cc +++ b/tensorflow/core/common_runtime/lower_while_op_test.cc @@ -253,7 +253,8 @@ TEST(LowerWhileOpTest, ForwardAssignedInputDevice) { TF_CHECK_OK(NodeBuilder("placed_node", "Placeholder") .Attr("dtype", type) .Finalize(graph.get(), &placeholder)); - const string assigned_device_name = "/job:localhost/replica:0/task:0/gpu:0"; + const std::string assigned_device_name = + "/job:localhost/replica:0/task:0/gpu:0"; placeholder->set_assigned_device_name(assigned_device_name); Node* while_node; std::vector inputs({NodeBuilder::NodeOut(placeholder)}); @@ -343,11 +344,11 @@ TEST(LowerWhileOpTest, ForwardRequestedInputDevice) { TF_ASSERT_OK(graph->AddFunctionLibrary(f_lib_proto)); auto type = DT_FLOAT; // We will place the loop var on the gpu:0. - const string gpu_0_device = "/job:localhost/replica:0/task:0/gpu:0"; + const std::string gpu_0_device = "/job:localhost/replica:0/task:0/gpu:0"; // We will place loop's control input on the gpu:1. - const string gpu_1_device = "/job:localhost/replica:0/task:0/gpu:1"; + const std::string gpu_1_device = "/job:localhost/replica:0/task:0/gpu:1"; // We will place While op on gpu:2. - const string gpu_2_device = "/job:localhost/replica:0/task:0/gpu:2"; + const std::string gpu_2_device = "/job:localhost/replica:0/task:0/gpu:2"; Node* gpu_0_ph; TF_CHECK_OK(NodeBuilder("placed_node", "Placeholder") .Attr("dtype", type) @@ -483,11 +484,11 @@ TEST(LowerWhileOpTest, ForwardColocationKeyAttribute) { TF_ASSERT_OK(graph->AddFunctionLibrary(f_lib_proto)); auto type = DT_FLOAT; // We will place the loop var on the gpu:0. - const string gpu_0_device = "/job:localhost/replica:0/task:0/gpu:0"; + const std::string gpu_0_device = "/job:localhost/replica:0/task:0/gpu:0"; // We will place loop's control input on the gpu:1. - const string gpu_1_device = "/job:localhost/replica:0/task:0/gpu:1"; + const std::string gpu_1_device = "/job:localhost/replica:0/task:0/gpu:1"; // We will place While op on gpu:2. 
- const string gpu_2_device = "/job:localhost/replica:0/task:0/gpu:2"; + const std::string gpu_2_device = "/job:localhost/replica:0/task:0/gpu:2"; Node* gpu_0_ph; AttrValue gpu_0_colocation_attr; gpu_0_colocation_attr.mutable_list()->add_s("loc@:some_op_on_gpu_0_device"); diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc index d22d72f1a57019..216fdfd6d239c4 100644 --- a/tensorflow/core/common_runtime/memory_types.cc +++ b/tensorflow/core/common_runtime/memory_types.cc @@ -34,14 +34,14 @@ struct Endpoint { }; struct EndpointHash { - uint32 operator()(const Endpoint& x) const { + uint32_t operator()(const Endpoint& x) const { return Hash32(reinterpret_cast(&x.node_id), sizeof(int), x.output_index); } }; struct EndpointEq { - uint32 operator()(const Endpoint& x, const Endpoint& y) const { + uint32_t operator()(const Endpoint& x, const Endpoint& y) const { return (x.node_id == y.node_id) && (x.output_index == y.output_index); } }; @@ -116,14 +116,14 @@ absl::Status ValidateMemoryTypes(const DeviceType& device_type, // within this process. That is sufficient because EnsureMemoryTypes // is only used on a TensorFlow graph that is gonna to be executed in // a single tf device (hence within a single process). -static string GetTensorName(const Edge* edge) { +static std::string GetTensorName(const Edge* edge) { static std::atomic counter(0); - return strings::StrCat("memtype_", counter.fetch_add(1), "_", - edge->src()->name()); + return absl::StrCat("memtype_", counter.fetch_add(1), "_", + edge->src()->name()); } -static Node* Send(Graph* g, const string& tensor_name, - const string& device_name, bool host, const Edge* edge) { +static Node* Send(Graph* g, const std::string& tensor_name, + const std::string& device_name, bool host, const Edge* edge) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), host ? "_HostSend" : "_Send") .Input(edge->src(), edge->src_output()) @@ -138,8 +138,8 @@ static Node* Send(Graph* g, const string& tensor_name, return ret; } -static Node* Recv(Graph* g, const string& tensor_name, - const string& device_name, bool host, const Edge* edge) { +static Node* Recv(Graph* g, const std::string& tensor_name, + const std::string& device_name, bool host, const Edge* edge) { Node* ret; TF_CHECK_OK( NodeBuilder(g->NewName("n"), host ? "_HostRecv" : "_Recv") @@ -156,7 +156,7 @@ static Node* Recv(Graph* g, const string& tensor_name, } absl::Status EnsureMemoryTypes(const DeviceType& device_type, - const string& device_name, Graph* g) { + const std::string& device_name, Graph* g) { struct Item { const Edge* edge; MemoryType sm; @@ -191,7 +191,7 @@ absl::Status EnsureMemoryTypes(const DeviceType& device_type, Endpoint key{e->src()->id(), e->src_output()}; auto iter = recv_nodes.find(key); if (iter == recv_nodes.end()) { - const string tensor_name = GetTensorName(e); + const std::string tensor_name = GetTensorName(e); Node* send = Send(g, tensor_name, device_name, (item.sm == HOST_MEMORY), e); recv = Recv(g, tensor_name, device_name, (item.dm == HOST_MEMORY), e); diff --git a/tensorflow/core/common_runtime/memory_types.h b/tensorflow/core/common_runtime/memory_types.h index 46a943c0a3836e..bbadfe24e156c8 100644 --- a/tensorflow/core/common_runtime/memory_types.h +++ b/tensorflow/core/common_runtime/memory_types.h @@ -36,7 +36,7 @@ absl::Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g); // be OK). Otherwise, returns an error and '*g' may be in an // invalidate state and the caller should discard it. 
absl::Status EnsureMemoryTypes(const DeviceType& device_type, - const string& device_name, Graph* g); + const std::string& device_name, Graph* g); // Get the memory type for 'index'th output of node 'n' in graph 'g', when // running on 'device_type'. diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc index 26f414c14204ce..0be98557679406 100644 --- a/tensorflow/core/common_runtime/memory_types_test.cc +++ b/tensorflow/core/common_runtime/memory_types_test.cc @@ -30,7 +30,7 @@ namespace tensorflow { TEST(MemoryTypeChecker, Int32OK) { Graph* g = new Graph(OpRegistry::Global()); Tensor v(DT_INT32, {}); - v.scalar<int32>().setZero(); + v.scalar<int32_t>().setZero(); auto in0 = test::graph::Constant(g, v); auto in1 = test::graph::Constant(g, v); test::graph::Add(g, in0, in1); @@ -45,7 +45,7 @@ TEST(MemoryTypeChecker, Int32OK) { TEST(MemoryTypeChecker, Int32NotOk) { Graph* g = new Graph(OpRegistry::Global()); Tensor v(DT_INT32, {}); - v.scalar<int32>().setZero(); + v.scalar<int32_t>().setZero(); auto x = test::graph::Constant(g, v); test::graph::Cast(g, x, DT_FLOAT); TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_CPU, g)); diff --git a/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_test.cc b/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_test.cc index 28f029350da1a2..9570b9407e1574 100644 --- a/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_test.cc +++ b/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_test.cc @@ -58,7 +58,7 @@ Tensor CreateTestTensor() { Tensor t(DT_INT8, TensorShape({10, 20})); for (int64_t a = 0; a < t.shape().dim_size(0); a++) { for (int64_t b = 0; b < t.shape().dim_size(1); b++) { - t.matrix<int8>()(a, b) = static_cast<int8>((a + 1) * (b + 1)); + t.matrix<int8_t>()(a, b) = static_cast<int8_t>((a + 1) * (b + 1)); } } return t; @@ -68,7 +68,8 @@ class FakeAllocator : public Allocator { public: std::string Name() override { return "fake"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override { - return port::AlignedMalloc(num_bytes, alignment); + return tsl::port::AlignedMalloc(num_bytes, + static_cast<int>(alignment)); } void DeallocateRaw(void* ptr) override { return port::AlignedFree(ptr); } }; @@ -112,8 +113,9 @@ class FakeDeviceManager : public DeviceMgr { bool ContainsDevice(int64_t device_incarnation) const override { return false; } - void ClearContainers(absl::Span<const string> containers) const override {} - int NumDeviceType(const string& type) const override { return 0; } + void ClearContainers( + absl::Span<const std::string> containers) const override {} + int NumDeviceType(const std::string& type) const override { return 0; } int NumDevices() const override { return 0; } Device* HostCPU() const override { return nullptr; } @@ -127,7 +129,7 @@ class TestDeviceContext : public DeviceContext { Tensor* device_tensor, StatusCallback done, bool sync_dst_compute) const override { Tensor test_tensor = CreateTestTensor(); - test::ExpectTensorEqual<int8>(test_tensor, *cpu_tensor); + test::ExpectTensorEqual<int8_t>(test_tensor, *cpu_tensor); done(absl::OkStatus()); } @@ -191,7 +193,7 @@ TEST(RendezvousCAPI, DeviceToHost) { }); callback_done.WaitForNotification(); Tensor test_tensor = CreateTestTensor(); - test::ExpectTensorEqual<int8>(test_tensor, result); + test::ExpectTensorEqual<int8_t>(test_tensor, result); Destroy(thunk); delete thunk; diff --git a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.cc b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.cc index
79b1eebbb3c6c7..29c45068316914 100644 --- a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.cc +++ b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.cc @@ -44,7 +44,8 @@ namespace tensorflow { // TODO(chuanhao): implement an API to query device memory, and make // memory_limit a parameter instead of hard coding. static DeviceAttributes BuildNextPluggableDeviceAttributes( - const string& name_prefix, const string& device_name, int device_ordinal) { + const std::string& name_prefix, const std::string& device_name, + int device_ordinal) { return Device::BuildDeviceAttributes( absl::StrCat(name_prefix, "/device:", device_name, ":", device_ordinal), DeviceType(device_name), Bytes(16ULL << 30), DeviceLocality(), diff --git a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h index cb8ecf514101b0..8ad6c2051a87ac 100644 --- a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h +++ b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h @@ -36,13 +36,13 @@ class NextPluggableDevice : public PjRtBaseDevice { public: struct Options { // The device name's prefix (e.g., "/task:7") - string device_name_prefix; + std::string device_name_prefix; // The name of the device (e.g., "GPU") - string device_name; + std::string device_name; // The name of the compilation device (e.g., "XLA_TPU_JIT"); - string compilation_device_name; + std::string compilation_device_name; // The TfDeviceId. int device_ordinal = -1; diff --git a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.cc b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.cc index 857d7f56a43355..f915ecdf47ce24 100644 --- a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.cc +++ b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.cc @@ -64,14 +64,14 @@ absl::StatusOr DeviceShapeRepresentation( } // namespace absl::Status NextPluggableDeviceFactory::ListPhysicalDevices( - std::vector* devices) { + std::vector* devices) { TF_Status* c_status = TF_NewStatus(); int32_t device_count = api_->TFNPD_GetDeviceCount(c_status); TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status)); TF_DeleteStatus(c_status); for (int i = 0; i < device_count; ++i) { - const string device_name = + const std::string device_name = absl::StrCat("/physical_device:", device_type_, ":", i); devices->push_back(device_name); } diff --git a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h index 5ccfb6dd336848..f23e5cd00cd76d 100644 --- a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h +++ b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h @@ -36,7 +36,7 @@ class NextPluggableDeviceFactory : public DeviceFactory { device_type_(device_type), compilation_device_name_(compilation_device_name) {} - absl::Status ListPhysicalDevices(std::vector* devices) override; + absl::Status ListPhysicalDevices(std::vector* devices) override; absl::Status CreateDevices( const SessionOptions& session_options, const std::string& name_prefix, diff --git a/tensorflow/core/config/BUILD b/tensorflow/core/config/BUILD index f41dc9f2d94a79..52217b6f7891a9 100644 --- a/tensorflow/core/config/BUILD 
+++ b/tensorflow/core/config/BUILD @@ -21,7 +21,10 @@ cc_library( "flags.h", ], visibility = ["//tensorflow:internal"], - deps = ["//tensorflow/core/platform:stringpiece"], + deps = [ + "//tensorflow/core/platform:stringpiece", + "@com_google_absl//absl/strings:string_view", + ], ) filegroup( @@ -63,6 +66,7 @@ cc_library( "//tensorflow/core/platform:stringpiece", "//tensorflow/core/util:env_var", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", ], ) diff --git a/tensorflow/core/config/flags.cc b/tensorflow/core/config/flags.cc index d2d1ea502dfe9e..faf53293eb82d2 100644 --- a/tensorflow/core/config/flags.cc +++ b/tensorflow/core/config/flags.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/config/flags.h" #include "absl/strings/ascii.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/util/env_var.h" diff --git a/tensorflow/core/config/flags.h b/tensorflow/core/config/flags.h index c882cd3939f4af..df4379e6ddb4b9 100644 --- a/tensorflow/core/config/flags.h +++ b/tensorflow/core/config/flags.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_CONFIG_FLAGS_H_ #define TENSORFLOW_CORE_CONFIG_FLAGS_H_ +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/stringpiece.h" namespace tensorflow { diff --git a/tensorflow/core/data/service/client/data_service_client.cc b/tensorflow/core/data/service/client/data_service_client.cc index 1a79089fbccc0f..e99277b79f8752 100644 --- a/tensorflow/core/data/service/client/data_service_client.cc +++ b/tensorflow/core/data/service/client/data_service_client.cc @@ -226,16 +226,16 @@ TraceMeMetadata DataServiceClient::GetTraceMeMetadata() const { "num_tasks", num_tasks == -1 ? 
kTraceInfoUnavailable - : strings::Printf("%lld", static_cast(num_tasks)))); + : absl::StrFormat("%lld", static_cast(num_tasks)))); result.push_back(std::make_pair("job_name", params_.job_name)); result.push_back(std::make_pair( "max_outstanding_requests", - strings::Printf( + absl::StrFormat( "%lld", static_cast(params_.max_outstanding_requests)))); if (params_.max_outstanding_requests == model::kAutotune) { result.push_back(std::make_pair( "autotuned_max_outstanding_requests", - strings::Printf("%lld", static_cast( + absl::StrFormat("%lld", static_cast( autotuned_max_outstanding_requests)))); } return result; @@ -295,7 +295,7 @@ void DataServiceClient::TaskThreadManager() TF_LOCKS_EXCLUDED(mu_) { auto cleanup = gtl::MakeCleanup([] { VLOG(1) << "Task thread manager exiting"; }); VLOG(1) << "Starting task thread manager"; - uint64 next_check = Env::Default()->NowMicros(); + uint64_t next_check = Env::Default()->NowMicros(); while (true) { { mutex_lock l(mu_); diff --git a/tensorflow/core/data/service/client/data_service_client.h b/tensorflow/core/data/service/client/data_service_client.h index 7c211d5551c46e..ecaecc841573e5 100644 --- a/tensorflow/core/data/service/client/data_service_client.h +++ b/tensorflow/core/data/service/client/data_service_client.h @@ -48,7 +48,7 @@ namespace data { class DataServiceContext { public: virtual ~DataServiceContext() = default; - virtual std::unique_ptr StartThread(const string& name, + virtual std::unique_ptr StartThread(const std::string& name, std::function fn) = 0; virtual void RecordBufferEnqueue(const std::vector& element) = 0; virtual void RecordBufferDequeue(const std::vector& element) = 0; diff --git a/tensorflow/core/data/service/client/data_service_client_test.cc b/tensorflow/core/data/service/client/data_service_client_test.cc index 9af455d11a0201..0baca60d285a8f 100644 --- a/tensorflow/core/data/service/client/data_service_client_test.cc +++ b/tensorflow/core/data/service/client/data_service_client_test.cc @@ -82,7 +82,7 @@ class TestDataServiceContext : public DataServiceContext { TestDataServiceContext() = default; ~TestDataServiceContext() override = default; - std::unique_ptr StartThread(const string& name, + std::unique_ptr StartThread(const std::string& name, std::function fn) override { return absl::WrapUnique( Env::Default()->StartThread({}, name, std::move(fn))); diff --git a/tensorflow/core/data/service/credentials_factory.cc b/tensorflow/core/data/service/credentials_factory.cc index 721ce5b806e7af..4362800c525137 100644 --- a/tensorflow/core/data/service/credentials_factory.cc +++ b/tensorflow/core/data/service/credentials_factory.cc @@ -58,7 +58,7 @@ absl::Status CredentialsFactory::Get(absl::string_view protocol, return absl::OkStatus(); } - std::vector available_types; + std::vector available_types; for (const auto& factory : credentials_factories()) { available_types.push_back(factory.first); } diff --git a/tensorflow/core/data/service/data_transfer.cc b/tensorflow/core/data/service/data_transfer.cc index 4f45b11d313e31..ee6a0b1c4d3daa 100644 --- a/tensorflow/core/data/service/data_transfer.cc +++ b/tensorflow/core/data/service/data_transfer.cc @@ -128,7 +128,7 @@ absl::Status DataTransferClient::Build( return it->second(config, out); } - std::vector available_names; + std::vector available_names; for (const auto& factory : transfer_client_factories()) { available_names.push_back(factory.first); } diff --git a/tensorflow/core/data/service/dispatcher_client.cc b/tensorflow/core/data/service/dispatcher_client.cc index 
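// --- Illustrative sketch, not part of the patch ---
// The DataServiceClient::GetTraceMeMetadata hunk above replaces the legacy
// strings::Printf helper with absl::StrFormat. The sketch shows the Abseil
// call on its own; FormatTaskCount and num_tasks are placeholder names.
#include <cstdint>
#include <string>
#include "absl/strings/str_format.h"

std::string FormatTaskCount(int64_t num_tasks) {
  // absl::StrFormat checks the format string against the argument types, so
  // the cast keeps the argument matched to %lld, as in the patch.
  return absl::StrFormat("%lld", static_cast<long long>(num_tasks));
}
// --- end sketch ---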
c06acb3e332ddf..4a3c8a12a31057 100644 --- a/tensorflow/core/data/service/dispatcher_client.cc +++ b/tensorflow/core/data/service/dispatcher_client.cc @@ -55,7 +55,7 @@ absl::Status DataServiceDispatcherClient::Initialize() { TF_RETURN_IF_ERROR( CredentialsFactory::CreateClientCredentials(protocol_, &credentials)); grpc::ChannelArguments args; - args.SetMaxReceiveMessageSize(std::numeric_limits::max()); + args.SetMaxReceiveMessageSize(std::numeric_limits::max()); args.SetInt(GRPC_ARG_USE_LOCAL_SUBCHANNEL_POOL, true); auto channel = grpc::CreateCustomChannel(address_, credentials, args); stub_ = DispatcherService::NewStub(channel); diff --git a/tensorflow/core/data/service/grpc_dispatcher_impl_test.cc b/tensorflow/core/data/service/grpc_dispatcher_impl_test.cc index c04cdf7a718456..6882a6b23e09e3 100644 --- a/tensorflow/core/data/service/grpc_dispatcher_impl_test.cc +++ b/tensorflow/core/data/service/grpc_dispatcher_impl_test.cc @@ -72,7 +72,7 @@ class GrpcDispatcherImplTest : public ::testing::Test { TF_RETURN_IF_ERROR( CredentialsFactory::CreateClientCredentials(kProtocol, &credentials)); ChannelArguments args; - args.SetMaxReceiveMessageSize(std::numeric_limits::max()); + args.SetMaxReceiveMessageSize(std::numeric_limits::max()); args.SetInt(GRPC_ARG_USE_LOCAL_SUBCHANNEL_POOL, true); std::shared_ptr channel = ::grpc::CreateCustomChannel(GetDispatcherAddress(), credentials, args); diff --git a/tensorflow/core/data/service/grpc_worker_impl_test.cc b/tensorflow/core/data/service/grpc_worker_impl_test.cc index 23eb6989c8cb1a..2d7563274bc295 100644 --- a/tensorflow/core/data/service/grpc_worker_impl_test.cc +++ b/tensorflow/core/data/service/grpc_worker_impl_test.cc @@ -83,7 +83,7 @@ class GrpcWorkerImplTest : public ::testing::Test { TF_RETURN_IF_ERROR( CredentialsFactory::CreateClientCredentials(kProtocol, &credentials)); ChannelArguments args; - args.SetMaxReceiveMessageSize(std::numeric_limits::max()); + args.SetMaxReceiveMessageSize(std::numeric_limits::max()); args.SetInt(GRPC_ARG_USE_LOCAL_SUBCHANNEL_POOL, true); std::shared_ptr channel = ::grpc::CreateCustomChannel(GetWorkerAddress(), credentials, args); diff --git a/tensorflow/core/data/service/snapshot/snapshot_manager.h b/tensorflow/core/data/service/snapshot/snapshot_manager.h index 98861523405206..4b2f19b9ca31ac 100644 --- a/tensorflow/core/data/service/snapshot/snapshot_manager.h +++ b/tensorflow/core/data/service/snapshot/snapshot_manager.h @@ -330,7 +330,7 @@ class SnapshotManager { absl::StatusOr> CreateSources( const DatasetDef& dataset_def) const; // Returns the total number of splits. - absl::StatusOr GetSplitsCardinality(); + absl::StatusOr GetSplitsCardinality(); // Resets a source when it runs out of splits, to support repetitions. 
absl::Status ResetSource(Source& source, int64_t source_index); int64_t num_sources() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc index 9f0ef2cf3c2886..495743c8d64c0c 100644 --- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc @@ -257,9 +257,10 @@ void BaseRemoteRendezvous::SameWorkerRecvDone( recv_args.alloc_attrs.gpu_compatible()); Allocator* out_allocator = dst_device->GetAllocator(attr); AllocationAttributes allocation_attr; - uint64 safe_alloc_frontier = dst_device->SafeAllocFrontier(0); + uint64_t safe_alloc_frontier = dst_device->SafeAllocFrontier(0); bool sync_dst_compute = (safe_alloc_frontier == 0); - std::function freed_by_func = [dst_device, &safe_alloc_frontier]() { + std::function freed_by_func = [dst_device, + &safe_alloc_frontier]() { safe_alloc_frontier = dst_device->SafeAllocFrontier(safe_alloc_frontier); return safe_alloc_frontier; }; diff --git a/tensorflow/core/distributed_runtime/cancellable_call.h b/tensorflow/core/distributed_runtime/cancellable_call.h index 7311c8e3a44f42..3a2691b7cff22f 100644 --- a/tensorflow/core/distributed_runtime/cancellable_call.h +++ b/tensorflow/core/distributed_runtime/cancellable_call.h @@ -27,8 +27,8 @@ namespace tensorflow { // registration with a CancellationManager. class CancellableCall { public: - CancellableCall(CancellationManager* cancel_mgr, const string& remote_worker, - WorkerCacheInterface* wc) + CancellableCall(CancellationManager* cancel_mgr, + const std::string& remote_worker, WorkerCacheInterface* wc) : is_cancelled_(false), cancel_mgr_(cancel_mgr), remote_worker_(remote_worker), @@ -51,7 +51,7 @@ class CancellableCall { mutex mu_; bool is_cancelled_; CancellationManager* const cancel_mgr_; // Not owned - const string remote_worker_; + const std::string remote_worker_; WorkerCacheInterface* const wc_; // Not owned WorkerInterface* const wi_; // Owned by wc_, must be released. CallOptions opts_; diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc index 966a281c1d2b66..c974bb4c520655 100644 --- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc +++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc @@ -39,9 +39,9 @@ absl::Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph( const OpDef& sig, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, const FunctionLibraryDefinition& flib_def, GraphDef* gdef, - std::vector* send_keys, std::vector* recv_keys) { - const string& target = options.target; - const string& func_name = sig.name(); + std::vector* send_keys, std::vector* recv_keys) { + const std::string& target = options.target; + const std::string& func_name = sig.name(); const FunctionDef* func_def = flib_def.Find(sig.name()); if (func_def == nullptr) { return errors::InvalidArgument("Function ", func_name, @@ -90,7 +90,7 @@ absl::Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph( // src_incarnation = 1 works because the transfer is across the same device. // TODO(rohanj): Find the src_incarnation for the remote device and set it. 
- const string& key = Rendezvous::CreateKey( + const std::string& key = Rendezvous::CreateKey( target, 1 /* src_incarnation */, target, in.name(), FrameAndIter(0, 0)); send_keys->push_back(key); ++i; @@ -140,7 +140,7 @@ absl::Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph( g.AddEdge(function_node, i, output_node, 0); - const string& key = + const std::string& key = Rendezvous::CreateKey(target, 1 /* src_incarnation */, target, out.name(), FrameAndIter(0, 0)); recv_keys->push_back(key); @@ -180,7 +180,7 @@ ClusterFunctionLibraryRuntime::~ClusterFunctionLibraryRuntime() { } void ClusterFunctionLibraryRuntime::Instantiate( - const string& function_name, const FunctionLibraryDefinition& lib_def, + const std::string& function_name, const FunctionLibraryDefinition& lib_def, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::LocalHandle* handle, FunctionLibraryRuntime::DoneCallback done) { @@ -192,7 +192,7 @@ void ClusterFunctionLibraryRuntime::Instantiate( WorkerInterface* wi = worker_cache->GetOrCreateWorker(target); if (wi == nullptr) { - std::vector workers; + std::vector workers; worker_session_->worker_cache()->ListWorkers(&workers); done(errors::InvalidArgument( "Could not find worker with target: ", target, @@ -202,8 +202,8 @@ void ClusterFunctionLibraryRuntime::Instantiate( // Make RPC and obtain a graph handle. GraphDef gdef; - auto* send_keys = new std::vector; - auto* recv_keys = new std::vector; + auto* send_keys = new std::vector; + auto* recv_keys = new std::vector; auto construct_graph_fn = [&](const FunctionLibraryDefinition* lib_def) { const FunctionDef* fdef = lib_def->Find(function_name); const OpDef& sig = fdef->signature(); @@ -285,7 +285,7 @@ void ClusterFunctionLibraryRuntime::Run( args[i].AsProtoTensorContent(send->mutable_tensor()); i++; } - const std::vector& recv_keys = function_data->recv_keys; + const std::vector& recv_keys = function_data->recv_keys; for (const auto& recv_key : recv_keys) { req->add_recv_key(recv_key); } @@ -308,7 +308,7 @@ void ClusterFunctionLibraryRuntime::Run( if (!local_status->ok()) { return; } - std::map mapped_recvs; + std::map mapped_recvs; for (auto& recv : *resp->mutable_recv()) { mapped_recvs[recv.name()] = recv.mutable_tensor(); } @@ -363,7 +363,7 @@ void ClusterFunctionLibraryRuntime::Run( } void ClusterFunctionLibraryRuntime::CleanUp( - uint64 step_id, FunctionLibraryRuntime::LocalHandle handle, + uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle, FunctionLibraryRuntime::DoneCallback done) { FunctionData* function_data = nullptr; { diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h index a016a5eea418df..2d66854ec8c2ca 100644 --- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h +++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h @@ -41,7 +41,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime { ~ClusterFunctionLibraryRuntime() override; - void Instantiate(const string& function_name, + void Instantiate(const std::string& function_name, const FunctionLibraryDefinition& lib_def, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::LocalHandle* handle, @@ -57,7 +57,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime { absl::Span args, std::vector* rets, FunctionLibraryRuntime::DoneCallback done) override; 
- void CleanUp(uint64 step_id, FunctionLibraryRuntime::LocalHandle handle, + void CleanUp(uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle, FunctionLibraryRuntime::DoneCallback done) override; DeviceMgr* remote_device_mgr() const override { return remote_device_mgr_; } @@ -67,7 +67,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime { const OpDef& sig, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, const FunctionLibraryDefinition& flib_def, GraphDef* g, - std::vector* send_keys, std::vector* recv_keys); + std::vector* send_keys, std::vector* recv_keys); friend class ClusterFunctionLibraryRuntimeTest; mutable mutex mu_; @@ -77,19 +77,19 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime { DeviceMgr* remote_device_mgr_; // not owned. struct FunctionData { - const string graph_handle; - const string target; + const std::string graph_handle; + const std::string target; // Hold a shared pointer to the underlying worker cache to avoid it being // deleted in potential cluster update. const std::shared_ptr worker_cache; WorkerInterface* wi = nullptr; - const std::vector send_keys; - const std::vector recv_keys; + const std::vector send_keys; + const std::vector recv_keys; - FunctionData(const string& graph_handle, const string& target, + FunctionData(const std::string& graph_handle, const std::string& target, std::shared_ptr worker_cache, - WorkerInterface* wi, const std::vector& send_keys, - const std::vector& recv_keys) + WorkerInterface* wi, const std::vector& send_keys, + const std::vector& recv_keys) : graph_handle(graph_handle), target(target), worker_cache(std::move(worker_cache)), diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc index 40290ef3e4f54e..9be587fb48880c 100644 --- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc +++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc @@ -42,7 +42,7 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test { &cluster_)); GrpcChannelSpec spec; - std::map host_ports; + std::map host_ports; int i = 0; for (const auto& target : cluster_->targets("localhost")) { host_ports[i++] = target; @@ -72,12 +72,13 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test { const OpDef& sig, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& options, const FunctionLibraryDefinition& lib_def, GraphDef* g, - std::vector* send_keys, std::vector* recv_keys) { + std::vector* send_keys, + std::vector* recv_keys) { return ClusterFunctionLibraryRuntime::ConstructFunctionGraph( sig, attrs, options, lib_def, g, send_keys, recv_keys); } - void Instantiate(const string& function_name, + void Instantiate(const std::string& function_name, const FunctionLibraryDefinition& lib_def, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& options, @@ -88,8 +89,8 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test { } absl::Status InstantiateAndRun( - const string& function_name, const FunctionLibraryDefinition& lib_def, - test::function::Attrs attrs, + const std::string& function_name, + const FunctionLibraryDefinition& lib_def, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& options, const std::vector& args, std::vector rets) { FunctionLibraryRuntime::LocalHandle handle; @@ -135,7 
+136,7 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test { TEST_F(ClusterFunctionLibraryRuntimeTest, ConstructFunctionGraph) { GraphDef actual; - std::vector send_keys, recv_keys; + std::vector send_keys, recv_keys; FunctionDefLibrary proto; *(proto.add_function()) = test::function::Swap(); FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto); @@ -402,10 +403,10 @@ TEST_F(ClusterFunctionLibraryRuntimeTest, DISABLED_InstantiateAndRun) { instantiate_opts.target = "/job:localhost/replica:0/task:1/cpu:0"; Tensor y; - auto x = test::AsTensor({1, 2, 3, 4}); + auto x = test::AsTensor({1, 2, 3, 4}); TF_EXPECT_OK(InstantiateAndRun("XTimesTwoInt32", lib_def, {}, instantiate_opts, {x}, {&y})); - test::ExpectTensorEqual(y, test::AsTensor({2, 4, 6, 8})); + test::ExpectTensorEqual(y, test::AsTensor({2, 4, 6, 8})); } TEST_F(ClusterFunctionLibraryRuntimeTest, diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc index ab13146b73bbbd..5acf12ccea0f69 100644 --- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc +++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc @@ -34,7 +34,7 @@ class CompleteGroupCall : public CancellableCall { CompleteGroupCall(const CollGroupParams& group, const DeviceAttributes& device, CancellationManager* cancel_mgr, - const string& remote_worker, WorkerCacheInterface* wc) + const std::string& remote_worker, WorkerCacheInterface* wc) : CancellableCall(cancel_mgr, remote_worker, wc) { req_.set_group_key(group.group_key); req_.set_group_size(group.group_size); @@ -55,9 +55,11 @@ class CompleteInstanceCall : public CancellableCall { public: CompleteInstanceCall(const CollGroupParams& group, const CollInstanceParams& instance, - const string& node_name, const string& device_name, - bool is_source, CancellationManager* cancel_mgr, - const string& remote_worker, WorkerCacheInterface* wc) + const std::string& node_name, + const std::string& device_name, bool is_source, + CancellationManager* cancel_mgr, + const std::string& remote_worker, + WorkerCacheInterface* wc) : CancellableCall(cancel_mgr, remote_worker, wc) { req_.set_name(node_name); req_.set_type(instance.type); @@ -91,7 +93,7 @@ CollectiveParamResolverDistributed::CollectiveParamResolverDistributed( const ConfigProto& config, const DeviceMgr* dev_mgr, DeviceResolverDistributed* dev_resolver, NcclCommunicatorInterface* nccl_communicator, - WorkerCacheInterface* worker_cache, const string& task_name) + WorkerCacheInterface* worker_cache, const std::string& task_name) : CollectiveParamResolverLocal(config, dev_mgr, dev_resolver, nccl_communicator, task_name), worker_cache_(worker_cache), @@ -364,8 +366,8 @@ absl::Status CollectiveParamResolverDistributed::UpdateInstanceCache( } void CollectiveParamResolverDistributed::CompleteInstanceDistributed( - const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr, - const StatusCallback& done) { + const std::string& device, CollectiveParams* cp, + CancellationManager* cancel_mgr, const StatusCallback& done) { if (group_leader_.empty()) { // This is the group leader so resolution is local. 
return CompleteInstanceLocal(device, cp, done); diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h index 63006c1253547e..d885fe0bb81a0e 100644 --- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h +++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h @@ -32,7 +32,7 @@ class CollectiveParamResolverDistributed : public CollectiveParamResolverLocal { const ConfigProto& config, const DeviceMgr* dev_mgr, DeviceResolverDistributed* dev_resolver, NcclCommunicatorInterface* nccl_communicator, - WorkerCacheInterface* worker_cache, const string& task_name); + WorkerCacheInterface* worker_cache, const std::string& task_name); void CompleteParamsAsync(const DeviceAttributes& device, CollectiveParams* cp, CancellationManager* cancel_mgr, @@ -82,13 +82,14 @@ class CollectiveParamResolverDistributed : public CollectiveParamResolverLocal { // Finish populating *cp. Semantics are like those of // CompleteInstanceLocal but will make a remote call to the group // leader if necessary. - void CompleteInstanceDistributed(const string& device, CollectiveParams* cp, + void CompleteInstanceDistributed(const std::string& device, + CollectiveParams* cp, CancellationManager* cancel_mgr, const StatusCallback& done) TF_LOCKS_EXCLUDED(instance_mu_, group_mu_); WorkerCacheInterface* worker_cache_; // Not owned - const string group_leader_; + const std::string group_leader_; CancellationManager abortion_cancel_mgr_; }; diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc index 31140bf0755740..2880d722f0efbf 100644 --- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc +++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc @@ -34,8 +34,8 @@ limitations under the License. namespace tensorflow { namespace { -static std::unique_ptr NewDevice(const string& type, - const string& name) { +static std::unique_ptr NewDevice(const std::string& type, + const std::string& name) { class FakeDevice : public Device { public: explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {} @@ -54,15 +54,16 @@ class FakeCache : public TestWorkerCache { public: // Override the Locality methods to actually pass through to the // worker. - bool GetDeviceLocalityNonBlocking(const string& device, + bool GetDeviceLocalityNonBlocking(const std::string& device, DeviceLocality* locality) override { return false; } - void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, + void GetDeviceLocalityAsync(const std::string& device, + DeviceLocality* locality, StatusCallback done) override { - string task_name; - string dev_part; + std::string task_name; + std::string dev_part; if (!DeviceNameUtils::SplitDeviceName(device, &task_name, &dev_part)) { done(errors::Internal("failed to parse device name")); return; @@ -94,7 +95,9 @@ class FakeCache : public TestWorkerCache { class FakeNcclCommunicator : public NcclCommunicatorInterface { public: // We only need to define GenerateCommunicatorKey(). 
- string GenerateCommunicatorKey() override { return "mock-communicator-key"; } + std::string GenerateCommunicatorKey() override { + return "mock-communicator-key"; + } void Enqueue(std::shared_ptr col_ctx, StatusCallback done) override { @@ -114,15 +117,16 @@ class DeviceResDistTest : public ::testing::Test { protected: void DefineWorkers(int num_workers, int num_devices, - const string& device_type, bool nccl) { + const std::string& device_type, bool nccl) { for (int w = 0; w < num_workers; ++w) { - string name = absl::StrCat("/job:worker/replica:0/task:", w); + std::string name = absl::StrCat("/job:worker/replica:0/task:", w); DefineWorker(name, device_type, num_devices, nccl); } } - void DefineWorker(const string& worker_name, const string& device_type, - int num_devices, bool nccl) { + void DefineWorker(const std::string& worker_name, + const std::string& device_type, int num_devices, + bool nccl) { ConfigProto config; config.mutable_experimental()->set_collective_group_leader( "/job:worker/replica:0/task:0"); @@ -136,7 +140,7 @@ class DeviceResDistTest : public ::testing::Test { } device_mgrs_[worker_name] = std::make_unique(std::move(devices)); - std::vector* dv = &dev_by_task_[worker_name]; + std::vector* dv = &dev_by_task_[worker_name]; dv->clear(); for (auto* d : device_mgrs_[worker_name]->ListDevices()) { dv->push_back(d->name()); @@ -160,14 +164,14 @@ class DeviceResDistTest : public ::testing::Test { } void DefineCollectiveParams(int num_workers, int num_devices, - const string& device_type, + const std::string& device_type, CollectiveType coll_type = REDUCTION_COLLECTIVE, int source_rank = 0) { for (int wi = 0; wi < num_workers; ++wi) { - string task_name = absl::StrCat("/job:worker/replica:0/task:", wi); + std::string task_name = absl::StrCat("/job:worker/replica:0/task:", wi); for (int di = 0; di < num_devices; ++di) { int idx = wi * num_devices + di; - string device_name = + std::string device_name = strings::StrCat(task_name, "/device:", device_type, ":", di); cp_[device_name] = CreateCollectiveParams(num_workers, num_devices, device_type, @@ -177,7 +181,7 @@ class DeviceResDistTest : public ::testing::Test { } CollectiveParams* CreateCollectiveParams(int num_workers, int num_devices, - const string& device_type, + const std::string& device_type, CollectiveType coll_type, bool is_source) { const int kGroupKey = 5; @@ -203,16 +207,16 @@ class DeviceResDistTest : public ::testing::Test { } int group_size = num_workers * num_devices; for (int wi = 0; wi < num_workers; ++wi) { - string task_name = absl::StrCat("/job:worker/replica:0/task:", wi); + std::string task_name = absl::StrCat("/job:worker/replica:0/task:", wi); for (int di = 0; di < num_devices; ++di) { - string device_name = absl::StrCat(task_name, "/device:CPU:", di); + std::string device_name = absl::StrCat(task_name, "/device:CPU:", di); IssueRequest(task_name, device_name, group_size); } } } - void IssueRequest(const string& task_name, const string& device_name, - int group_size) { + void IssueRequest(const std::string& task_name, + const std::string& device_name, int group_size) { Device* device = nullptr; TF_CHECK_OK(device_mgrs_[task_name]->LookupDevice(device_name, &device)); CollectiveParams* cp = cp_[device_name]; @@ -243,11 +247,11 @@ class DeviceResDistTest : public ::testing::Test { // Verify that all cp_ values get the same set of task and device // names, with unique default_rank in the expected order. 
const int dev_count = num_workers * num_devices; - string dev0 = "/job:worker/replica:0/task:0/device:CPU:0"; + std::string dev0 = "/job:worker/replica:0/task:0/device:CPU:0"; for (int wi = 0; wi < num_workers; ++wi) { - string task_name = absl::StrCat("/job:worker/replica:0/task:", wi); + std::string task_name = absl::StrCat("/job:worker/replica:0/task:", wi); for (int di = 0; di < num_devices; ++di) { - string device_name = absl::StrCat(task_name, "/device:CPU:", di); + std::string device_name = absl::StrCat(task_name, "/device:CPU:", di); int idx = wi * num_devices + di; TF_ASSERT_OK(status_[device_name]); EXPECT_EQ(cp_[device_name]->default_rank, idx); @@ -270,7 +274,8 @@ class DeviceResDistTest : public ::testing::Test { } } - void ValidateDeviceResolver(const CollectiveParams& cp, const string& task) { + void ValidateDeviceResolver(const CollectiveParams& cp, + const std::string& task) { for (const CollGroupMember& member : cp.group.members) { DeviceAttributes attributes; TF_ASSERT_OK(dev_resolvers_[task]->GetDeviceAttributes( @@ -279,14 +284,14 @@ class DeviceResDistTest : public ::testing::Test { } void RestartWorker(int worker_idx, int num_workers, int num_devices, - const string& device_type, bool nccl, + const std::string& device_type, bool nccl, CollectiveType coll_type = REDUCTION_COLLECTIVE, bool is_source = false) { - string worker_name = + std::string worker_name = absl::StrCat("/job:worker/replica:0/task:", worker_idx); DefineWorker(worker_name, device_type, num_devices, nccl); for (int i = 0; i < num_devices; ++i) { - string device_name = + std::string device_name = strings::StrCat(worker_name, "/device:", device_type, ":", i); if (cp_.find(device_name) != cp_.end()) { cp_[device_name]->Unref(); @@ -301,18 +306,18 @@ class DeviceResDistTest : public ::testing::Test { FakeNcclCommunicator nccl_communicator_; CancellationManager cm_; // Below are keyed by task names. 
- absl::flat_hash_map> device_mgrs_; - absl::flat_hash_map> + absl::flat_hash_map> device_mgrs_; + absl::flat_hash_map> dev_resolvers_; - absl::flat_hash_map> cp_resolvers_; - absl::flat_hash_map> dev_by_task_; - absl::flat_hash_map> worker_envs_; - absl::flat_hash_map> workers_; + absl::flat_hash_map> dev_by_task_; + absl::flat_hash_map> worker_envs_; + absl::flat_hash_map> workers_; // Below are keyed by device names; - absl::flat_hash_map cp_; - absl::flat_hash_map status_; + absl::flat_hash_map cp_; + absl::flat_hash_map status_; mutex mu_; int num_done_ TF_GUARDED_BY(mu_); condition_variable done_; @@ -343,8 +348,8 @@ TEST_F(DeviceResDistTest, DifferentIncarnation) { DefineCollectiveParams(num_workers, num_devices, "CPU"); IssueRequests(num_workers, num_devices); RestartWorker(1, num_workers, num_devices, "CPU", /*nccl*/ false); - const string task_name = "/job:worker/replica:0/task:1"; - const string device_name = absl::StrCat(task_name, "/device:CPU:0"); + const std::string task_name = "/job:worker/replica:0/task:1"; + const std::string device_name = absl::StrCat(task_name, "/device:CPU:0"); IssueRequest(task_name, device_name, num_workers * num_devices); EXPECT_TRUE(absl::IsFailedPrecondition(status_[device_name])); } diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc index 1b4ba6296f4978..afab5707e58e4e 100644 --- a/tensorflow/core/distributed_runtime/collective_rma_distributed.cc +++ b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc @@ -39,9 +39,9 @@ namespace { class RecvBufCall : public CancellableCall { public: - RecvBufCall(int64_t step_id, const string& peer_device, - const string& peer_task, const string& key, Device* to_device, - DeviceContext* to_device_ctx, + RecvBufCall(int64_t step_id, const std::string& peer_device, + const std::string& peer_task, const std::string& key, + Device* to_device, DeviceContext* to_device_ctx, const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor, const DeviceLocality& client_locality, const DeviceAttributes& server_attributes, @@ -107,11 +107,12 @@ absl::Status PopulateTensorFromResponse(const RecvBufResponse& response, } // namespace void CollectiveRemoteAccessDistributed::RecvFromPeer( - const string& peer_device, const string& peer_task, bool peer_is_local, - const string& key, Device* to_device, DeviceContext* to_device_ctx, - const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor, - const DeviceLocality& client_locality, int dev_to_dev_stream_index, - CancellationManager* cancellation_manager, const StatusCallback& done) { + const std::string& peer_device, const std::string& peer_task, + bool peer_is_local, const std::string& key, Device* to_device, + DeviceContext* to_device_ctx, const AllocatorAttributes& to_alloc_attr, + Tensor* to_tensor, const DeviceLocality& client_locality, + int dev_to_dev_stream_index, CancellationManager* cancellation_manager, + const StatusCallback& done) { if (peer_is_local) { CollectiveRemoteAccessLocal::RecvFromPeer( peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx, @@ -232,7 +233,7 @@ void CollectiveRemoteAccessDistributed::RecvFromPeer( } void CollectiveRemoteAccessDistributed::CheckPeerHealth( - const string& peer_task, int64_t timeout_in_ms, + const std::string& peer_task, int64_t timeout_in_ms, const StatusCallback& done) { if (peer_task == task_name_) { // Fast path if the peer is the worker itself. 
@@ -265,7 +266,7 @@ void CollectiveRemoteAccessDistributed::CheckPeerHealth( s = dev_resolver_->GetAllDeviceAttributes(peer_task, &cached_attrs); } if (s.ok()) { - absl::flat_hash_set remote_incarnations; + absl::flat_hash_set remote_incarnations; for (const DeviceAttributes& da : resp->device_attributes()) { remote_incarnations.insert(da.incarnation()); } diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed.h b/tensorflow/core/distributed_runtime/collective_rma_distributed.h index 22d4d6f5a119e6..4557e9b36ac206 100644 --- a/tensorflow/core/distributed_runtime/collective_rma_distributed.h +++ b/tensorflow/core/distributed_runtime/collective_rma_distributed.h @@ -29,7 +29,8 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal { CollectiveRemoteAccessDistributed( const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver, std::shared_ptr work_queue, - WorkerCacheInterface* worker_cache, int64_t step_id, string task_name) + WorkerCacheInterface* worker_cache, int64_t step_id, + std::string task_name) : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id), worker_cache_(worker_cache), work_queue_(std::move(work_queue)), @@ -37,8 +38,9 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal { ~CollectiveRemoteAccessDistributed() override {} - void RecvFromPeer(const string& peer_device, const string& peer_task, - bool peer_is_local, const string& key, Device* to_device, + void RecvFromPeer(const std::string& peer_device, + const std::string& peer_task, bool peer_is_local, + const std::string& key, Device* to_device, DeviceContext* to_device_ctx, const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor, const DeviceLocality& client_locality, @@ -46,7 +48,7 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal { CancellationManager* cancellation_manager, const StatusCallback& done) override; - void CheckPeerHealth(const string& peer_task, int64_t timeout_in_ms, + void CheckPeerHealth(const std::string& peer_task, int64_t timeout_in_ms, const StatusCallback& done) override; void StartAbort(const absl::Status& s) override; @@ -57,7 +59,7 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal { // `CollectiveExecutorMgr`. 
std::shared_ptr work_queue_; CancellationManager abortion_cancel_mgr_; - string task_name_; + std::string task_name_; }; } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc index a2ec3b1aff2834..4d626cb9f49a9c 100644 --- a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc +++ b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc @@ -50,14 +50,16 @@ namespace { class FakeAllocator : public Allocator { public: - string Name() override { return "fake"; } + std::string Name() override { return "fake"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override { - return port::AlignedMalloc(num_bytes, alignment); + return tsl::port::AlignedMalloc(num_bytes, + static_cast(alignment)); } void DeallocateRaw(void* ptr) override { return port::AlignedFree(ptr); } }; -static std::unique_ptr NewDevice(const string& type, const string& name, +static std::unique_ptr NewDevice(const std::string& type, + const std::string& name, Allocator* allocator) { class FakeDevice : public Device { public: @@ -81,7 +83,7 @@ static int64_t kStepId = 123; class FakeWorker : public TestWorkerInterface { public: - FakeWorker(const string& name, DeviceMgr* dev_mgr, + FakeWorker(const std::string& name, DeviceMgr* dev_mgr, DeviceResolverDistributed* dres, bool is_failed, bool set_tensor_in_extra) : name_(name), @@ -144,7 +146,7 @@ class FakeWorker : public TestWorkerInterface { // Since this is not really RDMA into pre-allocated memory send // the bytes in the response. RecvBufRespExtra extra; - extra.add_tensor_content(string( + extra.add_tensor_content(std::string( reinterpret_cast(DMAHelper::base(h->prod_value)), num_bytes)); response->mutable_transport_options()->PackFrom(extra); @@ -164,7 +166,7 @@ class FakeWorker : public TestWorkerInterface { } private: - string name_; + std::string name_; DeviceMgr* device_mgr_; DeviceResolverDistributed* device_resolver_; BufRendezvous buf_rendezvous_; @@ -176,15 +178,16 @@ class FakeCache : public TestWorkerCache { public: // Override the Locality methods to actually pass through to the // worker. 
- bool GetDeviceLocalityNonBlocking(const string& device, + bool GetDeviceLocalityNonBlocking(const std::string& device, DeviceLocality* locality) override { return false; } - void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, + void GetDeviceLocalityAsync(const std::string& device, + DeviceLocality* locality, StatusCallback done) override { - string task_name; - string dev_part; + std::string task_name; + std::string dev_part; if (!DeviceNameUtils::SplitDeviceName(device, &task_name, &dev_part)) { done(errors::Internal("failed to parse device name")); return; @@ -246,10 +249,10 @@ class CollRMADistTest void SetUp() override { const int num_workers = 2; const int num_devices = 1; - string device_type = "CPU"; - string dev0_worker_name; + std::string device_type = "CPU"; + std::string dev0_worker_name; for (int w = 0; w < num_workers; ++w) { - string name = absl::StrCat("/job:worker/replica:0/task:", w); + std::string name = absl::StrCat("/job:worker/replica:0/task:", w); if (w == 0) { dev0_worker_name = name; } @@ -288,8 +291,9 @@ class CollRMADistTest } } - void DefineWorker(const string& worker_name, const string& device_type, - int num_devices, bool is_failed = false) { + void DefineWorker(const std::string& worker_name, + const std::string& device_type, int num_devices, + bool is_failed = false) { std::vector> devices; for (int i = 0; i < num_devices; ++i) { devices.push_back(NewDevice( @@ -316,8 +320,9 @@ class CollRMADistTest wc_.AddWorker(worker_name, fw); } - void RestartWorker(const string& worker_name, const string& device_type, - int num_devices, bool is_failed = false) { + void RestartWorker(const std::string& worker_name, + const std::string& device_type, int num_devices, + bool is_failed = false) { auto it = dev_resolvers_.find(worker_name); if (it != dev_resolvers_.end()) { delete it->second; @@ -354,8 +359,8 @@ class CollRMADistTest FakeCache wc_; CancellationManager cm_; std::vector device_mgrs_; - std::unordered_map dev_resolvers_; - std::unordered_map> dev_by_task_; + std::unordered_map dev_resolvers_; + std::unordered_map> dev_by_task_; std::shared_ptr work_queue_; std::vector workers_; std::unique_ptr rma_; @@ -379,7 +384,7 @@ TEST_P(CollRMADistTest, ProdFirstOK) { absl::Status consumer_status; absl::Status producer_status; FakeWorker* wi = workers_[1]; - const string kBufKey = "fake_buf_key"; + const std::string kBufKey = "fake_buf_key"; wi->buf_rendezvous()->ProvideBuf( kBufKey, nullptr /*device*/, nullptr /*dev_ctx*/, &expected_value_, AllocatorAttributes(), @@ -389,7 +394,7 @@ TEST_P(CollRMADistTest, ProdFirstOK) { }, nullptr /*cancellation_manager*/); Device* dst_device = nullptr; - string dev_name = "CPU:0"; + std::string dev_name = "CPU:0"; TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device)); DeviceContext* to_device_ctx = nullptr; MaybeSetGPUDevice(dst_device); @@ -418,9 +423,9 @@ TEST_P(CollRMADistTest, ConsFirstOK) { absl::Status consumer_status; absl::Status producer_status; FakeWorker* wi = workers_[1]; - const string kBufKey = "fake_buf_key"; + const std::string kBufKey = "fake_buf_key"; Device* dst_device = nullptr; - string dev_name = "CPU:0"; + std::string dev_name = "CPU:0"; TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device)); MaybeSetGPUDevice(dst_device); DeviceContext* to_device_ctx = nullptr; @@ -454,9 +459,9 @@ TEST_P(CollRMADistTest, ConsFirstAbort) { ResolveDeviceAttributes(); absl::Notification consumer_note; absl::Status consumer_status; - const string kBufKey = "fake_buf_key"; + const 
std::string kBufKey = "fake_buf_key"; Device* dst_device = nullptr; - string dev_name = "CPU:0"; + std::string dev_name = "CPU:0"; TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device)); MaybeSetGPUDevice(dst_device); DeviceContext* to_device_ctx = nullptr; @@ -483,7 +488,7 @@ TEST_P(CollRMADistTest, ResponseTooLarge) { absl::Status consumer_status; absl::Status producer_status; FakeWorker* wi = workers_[1]; - const string kBufKey = "fake_buf_key"; + const std::string kBufKey = "fake_buf_key"; wi->buf_rendezvous()->ProvideBuf( kBufKey, nullptr /*device*/, nullptr /*dev_ctx*/, &large_response_, AllocatorAttributes(), @@ -493,7 +498,7 @@ TEST_P(CollRMADistTest, ResponseTooLarge) { }, nullptr /*cancellation_manager*/); Device* dst_device = nullptr; - string dev_name = "CPU:0"; + std::string dev_name = "CPU:0"; TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device)); DeviceContext* to_device_ctx = nullptr; MaybeSetGPUDevice(dst_device); @@ -523,9 +528,9 @@ TEST_P(CollRMADistTest, WorkerRestart) { absl::Status consumer_status; absl::Status producer_status; FakeWorker* wi = workers_[1]; - const string buf_key = "fake_buf_key"; + const std::string buf_key = "fake_buf_key"; Device* dst_device = nullptr; - string dev_name = "CPU:0"; + std::string dev_name = "CPU:0"; TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device)); MaybeSetGPUDevice(dst_device); DeviceContext* to_device_ctx = nullptr; diff --git a/tensorflow/core/distributed_runtime/device_resolver_distributed.cc b/tensorflow/core/distributed_runtime/device_resolver_distributed.cc index f0f8c50b2fd50a..3de97cc08726ff 100644 --- a/tensorflow/core/distributed_runtime/device_resolver_distributed.cc +++ b/tensorflow/core/distributed_runtime/device_resolver_distributed.cc @@ -28,7 +28,7 @@ DeviceResolverDistributed::DeviceResolverDistributed(const DeviceMgr* dev_mgr) { } absl::Status DeviceResolverDistributed::GetDeviceAttributes( - const string& device, DeviceAttributes* attributes) { + const std::string& device, DeviceAttributes* attributes) { mutex_lock l(mu_); auto it = attr_table_.find(device); if (it == attr_table_.end()) { @@ -39,11 +39,11 @@ absl::Status DeviceResolverDistributed::GetDeviceAttributes( } absl::Status DeviceResolverDistributed::GetAllDeviceAttributes( - const string& task, std::vector* attributes) { + const std::string& task, std::vector* attributes) { mutex_lock l(mu_); attributes->clear(); for (const auto& it : attr_table_) { - const string& device_name = it.first; + const std::string& device_name = it.first; if (DeviceNameUtils::IsSameAddressSpace(task, device_name)) { attributes->push_back(it.second); } diff --git a/tensorflow/core/distributed_runtime/device_resolver_distributed.h b/tensorflow/core/distributed_runtime/device_resolver_distributed.h index b46c288cb3a456..3bf6cfa813fe2f 100644 --- a/tensorflow/core/distributed_runtime/device_resolver_distributed.h +++ b/tensorflow/core/distributed_runtime/device_resolver_distributed.h @@ -31,19 +31,21 @@ class DeviceResolverDistributed : public DeviceResolverInterface { public: explicit DeviceResolverDistributed(const DeviceMgr* dev_mgr); - absl::Status GetDeviceAttributes(const string& device, + absl::Status GetDeviceAttributes(const std::string& device, DeviceAttributes* attributes) override; absl::Status GetAllDeviceAttributes( - const string& task, std::vector* attributes) override; + const std::string& task, + std::vector* attributes) override; absl::Status UpdateDeviceAttributes( const std::vector& attributes) override; protected: - 
const string task_name_; + const std::string task_name_; mutex mu_; - absl::flat_hash_map attr_table_ TF_GUARDED_BY(mu_); + absl::flat_hash_map attr_table_ + TF_GUARDED_BY(mu_); }; } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc index 0c2bdba1da59d4..8a3245ce2ee3e5 100644 --- a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc +++ b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc @@ -34,7 +34,8 @@ using ::testing::UnorderedElementsAre; // Create a fake 'Device' whose only interesting attribute is a non-default // DeviceLocality and incarnation. -std::unique_ptr NewDevice(const string& type, const string& name) { +std::unique_ptr NewDevice(const std::string& type, + const std::string& name) { class FakeDevice : public Device { public: explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {} diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc index 5688c30275eb2e..f62268f3a40d3f 100644 --- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc +++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc @@ -54,7 +54,7 @@ void StripDefaultAttributesInRegisterFunctionOp( } // namespace void EagerClusterFunctionLibraryRuntime::Instantiate( - const string& function_name, const FunctionLibraryDefinition& lib_def, + const std::string& function_name, const FunctionLibraryDefinition& lib_def, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::LocalHandle* handle, FunctionLibraryRuntime::DoneCallback done) { @@ -281,7 +281,7 @@ void EagerClusterFunctionLibraryRuntime::Run( } void EagerClusterFunctionLibraryRuntime::CleanUp( - uint64 step_id, FunctionLibraryRuntime::LocalHandle handle, + uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle, FunctionLibraryRuntime::DoneCallback done) { FunctionData* function_data = nullptr; { @@ -312,7 +312,8 @@ void EagerClusterFunctionLibraryRuntime::CleanUp( } DistributedFunctionLibraryRuntime* CreateClusterFLR( - const uint64 context_id, EagerContext* ctx, WorkerSession* worker_session) { + const uint64_t context_id, EagerContext* ctx, + WorkerSession* worker_session) { return new EagerClusterFunctionLibraryRuntime( context_id, ctx, worker_session->remote_device_mgr()); } diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h index 58af5ed93ae8ac..6fb1fc280f0638 100644 --- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h +++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h @@ -37,7 +37,8 @@ namespace eager { class EagerClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime { public: - EagerClusterFunctionLibraryRuntime(const uint64 context_id, EagerContext* ctx, + EagerClusterFunctionLibraryRuntime(const uint64_t context_id, + EagerContext* ctx, DeviceMgr* remote_device_mgr) : context_id_(context_id), ctx_(ctx), @@ -49,7 +50,7 @@ class EagerClusterFunctionLibraryRuntime // on the remote target specified in `options.target`. This should be // triggered as part of instantiating a multi-device function in // ProcessFunctionLibraryRuntime. 
- void Instantiate(const string& function_name, + void Instantiate(const std::string& function_name, const FunctionLibraryDefinition& lib_def, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::LocalHandle* handle, @@ -75,23 +76,23 @@ class EagerClusterFunctionLibraryRuntime absl::Span args, std::vector* rets, FunctionLibraryRuntime::DoneCallback done) override; - void CleanUp(uint64 step_id, FunctionLibraryRuntime::LocalHandle handle, + void CleanUp(uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle, FunctionLibraryRuntime::DoneCallback done) override; DeviceMgr* remote_device_mgr() const override { return remote_device_mgr_; } private: - const uint64 context_id_; + const uint64_t context_id_; EagerContext* ctx_; DeviceMgr* remote_device_mgr_; // not owned. struct FunctionData { - const string target; + const std::string target; const absl::optional> ret_indices; core::RefCountPtr eager_client; std::unique_ptr op; - FunctionData(const string& target, + FunctionData(const std::string& target, const absl::optional>& ret_indices, EagerClient* eager_client, std::unique_ptr op) : target(target), @@ -107,7 +108,8 @@ class EagerClusterFunctionLibraryRuntime }; DistributedFunctionLibraryRuntime* CreateClusterFLR( - const uint64 context_id, EagerContext* ctx, WorkerSession* worker_session); + const uint64_t context_id, EagerContext* ctx, + WorkerSession* worker_session); } // namespace eager } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h b/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h index ade7260cc9fb74..a0991dc601be4e 100644 --- a/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h +++ b/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h @@ -71,8 +71,8 @@ class DestroyTensorHandleNode : public tensorflow::AsyncEagerNode { // Remote node deletions are best effort bool Fatal() const override { return false; } - string DebugString() const override { - string out = "[DestroyTensorHandleNode]"; + std::string DebugString() const override { + std::string out = "[DestroyTensorHandleNode]"; absl::StrAppend(&out, " request: ", request_->DebugString()); return out; } @@ -80,7 +80,7 @@ class DestroyTensorHandleNode : public tensorflow::AsyncEagerNode { private: std::unique_ptr request_; core::RefCountPtr eager_client_; - const string remote_task_; + const std::string remote_task_; bool ready_; }; diff --git a/tensorflow/core/distributed_runtime/eager/eager_client.h b/tensorflow/core/distributed_runtime/eager/eager_client.h index 6fc956014ab666..a2a3d596bff10a 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_client.h +++ b/tensorflow/core/distributed_runtime/eager/eager_client.h @@ -92,7 +92,7 @@ class EagerClientCache { // increment the refcount of the client. The reference ownership is // transferred to the caller, and the unref should automatically happen when // destructing the RefCountPtr object from the caller's side. 
- virtual absl::Status GetClient(const string& target, + virtual absl::Status GetClient(const std::string& target, core::RefCountPtr* client) = 0; }; diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index 2bb45a8ed53d67..abae4bdce1d23a 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -59,8 +59,9 @@ namespace eager { namespace { absl::Status GetNumRetvals( - FunctionLibraryDefinition* func_lib_def, const string& op_name, - const google::protobuf::Map& attrs, int* num_retvals) { + FunctionLibraryDefinition* func_lib_def, const std::string& op_name, + const google::protobuf::Map& attrs, + int* num_retvals) { const tensorflow::OpRegistrationData* op_reg_data = nullptr; auto status = tensorflow::OpRegistry::Global()->LookUp(op_name, &op_reg_data); if (absl::IsNotFound(status)) { @@ -189,10 +190,10 @@ absl::Status TensorHandleShape(TensorHandle* handle, TensorShapeProto* proto) { absl::Status AddOpRetvalsToResponse( EagerContext* eager_context, int op_id, int num_retvals, - const std::vector& output_nums, TensorHandle** retvals, + const std::vector& output_nums, TensorHandle** retvals, std::function add_tensor_proto_fn, std::function add_shape_proto_fn, - std::function add_device_fn = nullptr) { + std::function add_device_fn = nullptr) { // retvals hold references to the allocated output tensor handles. If errors // happen with adding some results to the response, aggregate the status in sg // instead of directly returning the error, to make sure unref or ownership @@ -291,7 +292,7 @@ absl::Status EagerServiceImpl::CreateContext( TF_RETURN_IF_ERROR(env_->session_mgr->DeleteAllSessions()); // Cleanup existing contexts if any. - std::unordered_map tmp_contexts; + std::unordered_map tmp_contexts; { mutex_lock l(contexts_mu_); if (!contexts_.empty()) { @@ -372,7 +373,7 @@ absl::Status EagerServiceImpl::CreateContext( // case ctx will be deleted by this unref. 
core::ScopedUnref unref_ctx(ctx); - std::vector remote_workers; + std::vector remote_workers; worker_session->worker_cache()->ListWorkers(&remote_workers); remote_workers.erase(std::remove(remote_workers.begin(), remote_workers.end(), worker_session->worker_name()), @@ -500,7 +501,7 @@ absl::Status EagerServiceImpl::UpdateContext( const tensorflow::DeviceMgr* device_mgr = worker_session->device_mgr(); - std::vector remote_workers; + std::vector remote_workers; worker_session->worker_cache()->ListWorkers(&remote_workers); remote_workers.erase(std::remove(remote_workers.begin(), remote_workers.end(), worker_session->worker_name()), @@ -508,7 +509,7 @@ absl::Status EagerServiceImpl::UpdateContext( VLOG(1) << "On existing server " << worker_session->worker_name() << " updating remote workers"; if (VLOG_IS_ON(2)) { - for (const string& rw : remote_workers) { + for (const std::string& rw : remote_workers) { VLOG(2) << "Remote worker " << rw; } } @@ -546,8 +547,8 @@ absl::Status EagerServiceImpl::UpdateContext( return absl::OkStatus(); } -absl::Status EagerServiceImpl::CreateMasterContext( - const tensorflow::uint64 context_id, EagerContext* context) { +absl::Status EagerServiceImpl::CreateMasterContext(const uint64_t context_id, + EagerContext* context) { { mutex_lock l(contexts_mu_); auto iter = contexts_.find(context_id); @@ -616,7 +617,7 @@ void EagerServiceImpl::RunComponentFunction( auto* retvals = new absl::FixedArray(*num_retvals); VLOG(3) << "ServerContext: Calling EagerLocalExecuteAsync for op " << operation.id(); - std::vector output_nums; + std::vector output_nums; for (const int32_t output_num : request->output_num()) { output_nums.push_back(output_num); } @@ -676,7 +677,7 @@ absl::Status EagerServiceImpl::ExecuteOp(CallOptions* call_opts, num_retvals), &num_retvals)); - std::function add_device_fn = nullptr; + std::function add_device_fn = nullptr; // Send the output devices of a function back to let a client know where the // outputs are. For a primitive op, an output devics is the op device which is // known on a client. @@ -694,7 +695,7 @@ absl::Status EagerServiceImpl::ExecuteOp(CallOptions* call_opts, absl::Status EagerServiceImpl::Enqueue(CallOptions* call_opts, const EnqueueRequest* request, EnqueueResponse* response, - uint64 stream_id) { + uint64_t stream_id) { tsl::profiler::TraceMe activity( [&] { return absl::StrCat( @@ -901,12 +902,12 @@ absl::Status EagerServiceImpl::SendPackedHandle( } absl::Status EagerServiceImpl::GetServerContext( - uint64 context_id, ServerContext** server_context) { + uint64_t context_id, ServerContext** server_context) { tf_shared_lock l(contexts_mu_); auto iter = contexts_.find(context_id); if (iter == contexts_.end()) { *server_context = nullptr; - return errors::Aborted(strings::Printf( + return errors::Aborted(absl::StrFormat( "Unable to find a context_id matching the specified one " "(%llu). Perhaps the worker was restarted, or the context was GC'd?", static_cast(context_id))); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h index 329f60cf583ef7..90d49cc7a64e19 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h @@ -83,15 +83,15 @@ class EagerServiceImpl { UpdateContextResponse* response); // Create a ServerContext for master eager context. 
- absl::Status CreateMasterContext(const tensorflow::uint64 context_id, + absl::Status CreateMasterContext(const uint64_t context_id, EagerContext* context); - static constexpr uint64 kInvalidStreamId = 0; + static constexpr uint64_t kInvalidStreamId = 0; // Used by both Enqueue and StreamingEnqueue RPCs. absl::Status Enqueue(CallOptions* call_opts, const EnqueueRequest* request, EnqueueResponse* response, - uint64 stream_id = kInvalidStreamId); + uint64_t stream_id = kInvalidStreamId); absl::Status WaitQueueDone(const WaitQueueDoneRequest* request, WaitQueueDoneResponse* response); @@ -166,7 +166,7 @@ class EagerServiceImpl { const bool is_master_; }; // The returned ServerContext will need to be Unrefed. - absl::Status GetServerContext(uint64, ServerContext**); + absl::Status GetServerContext(uint64_t, ServerContext**); class ClientTensorHandleDeleteNode : public EagerNode { public: @@ -194,8 +194,8 @@ class EagerServiceImpl { // Remote node deletions are best effort bool Fatal() const override { return false; } - string DebugString() const override { - string out = "[ClientTensorHandleDeleteNode]"; + std::string DebugString() const override { + std::string out = "[ClientTensorHandleDeleteNode]"; absl::StrAppend(&out, " op_id: ", handle_to_delete_->op_id); absl::StrAppend(&out, ", output_num: ", handle_to_delete_->output_num); return out; @@ -225,7 +225,7 @@ class EagerServiceImpl { WorkerEnv* const env_; // Not owned. mutex contexts_mu_; - std::unordered_map contexts_ + std::unordered_map contexts_ TF_GUARDED_BY(contexts_mu_); std::unique_ptr gc_thread_; diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index a4b1f6552b4b33..e9be274d4fea19 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -56,14 +56,14 @@ namespace { class TestEagerServiceImpl : public EagerServiceImpl { public: explicit TestEagerServiceImpl(WorkerEnv* env) : EagerServiceImpl(env) {} - absl::Status GetEagerContext(const uint64 context_id, EagerContext** ctx) { + absl::Status GetEagerContext(const uint64_t context_id, EagerContext** ctx) { ServerContext* context = nullptr; TF_RETURN_IF_ERROR(GetServerContext(context_id, &context)); core::ScopedUnref context_unref(context); *ctx = context->Context(); return absl::OkStatus(); } - absl::Status GetTensorHandle(const uint64 context_id, + absl::Status GetTensorHandle(const uint64_t context_id, const RemoteTensorHandleInternal& remote_handle, tensorflow::TensorHandle** handle) { ServerContext* context = nullptr; @@ -136,7 +136,7 @@ class FakeEagerClient : public EagerClient { class DummyEagerClientCache : public EagerClientCache { public: DummyEagerClientCache() : client_(new FakeEagerClient) {} - absl::Status GetClient(const string& target, + absl::Status GetClient(const std::string& target, core::RefCountPtr* client) override { client->reset(client_.get()); client_->Ref(); @@ -154,7 +154,7 @@ class FakeCache : public TestWorkerCache { return absl::OkStatus(); } - void ListWorkers(std::vector* workers) const override { + void ListWorkers(std::vector* workers) const override { workers->push_back("/job:localhost/replica:0/task:0"); } }; @@ -202,10 +202,11 @@ void SetTensorProto(TensorProto* tensor_proto) { } void BuildOperation( - Operation* operation, int64_t id, const string& name, - const std::vector>>& + Operation* operation, int64_t id, const std::string& name, + 
const std::vector>>& inputs, - const std::unordered_map& attrs, const string& device) { + const std::unordered_map& attrs, + const std::string& device) { operation->set_id(id); operation->set_name(name); operation->set_device(device); @@ -216,7 +217,7 @@ void BuildOperation( std::get(input); } else { const auto& tensor_handle_pair = - std::get>(input); + std::get>(input); auto* input = operation->add_op_inputs()->mutable_remote_handle(); input->set_op_id(tensor_handle_pair.first); input->set_output_num(tensor_handle_pair.second); @@ -231,21 +232,22 @@ void BuildOperation( } void AddOperationToEnqueueRequest( - int64_t id, const string& name, - const std::vector>>& + int64_t id, const std::string& name, + const std::vector>>& inputs, - const std::unordered_map& attrs, const string& device, - EnqueueRequest* request) { + const std::unordered_map& attrs, + const std::string& device, EnqueueRequest* request) { auto* operation = request->add_queue()->mutable_operation(); BuildOperation(operation, id, name, inputs, attrs, device); } void AddOperationToRunComponentFunctionRequest( - int64_t id, const string& name, - const std::vector>>& + int64_t id, const std::string& name, + const std::vector>>& inputs, - const std::unordered_map& attrs, const string& device, - const int output_num, RunComponentFunctionRequest* request) { + const std::unordered_map& attrs, + const std::string& device, const int output_num, + RunComponentFunctionRequest* request) { auto* operation = request->mutable_operation(); operation->set_is_function(true); operation->set_is_component_function(true); @@ -450,7 +452,7 @@ tensorflow::FunctionDef SingleRecvNodeFunction() { TEST_F(EagerServiceImplTest, BasicTest) { TestEagerServiceImpl eager_service_impl(&worker_env_); - uint64 context_id = random::New64(); + uint64_t context_id = random::New64(); CreateContextRequest request; request.mutable_server_def()->set_job_name("localhost"); @@ -464,7 +466,7 @@ TEST_F(EagerServiceImplTest, BasicTest) { remote_enqueue_request.set_context_id(context_id); EnqueueResponse remote_enqueue_response; - std::unordered_map const_attrs; + std::unordered_map const_attrs; AttrValue val; val.set_type(tensorflow::DataType::DT_FLOAT); const_attrs.insert({"dtype", val}); @@ -476,7 +478,7 @@ TEST_F(EagerServiceImplTest, BasicTest) { "/job:localhost/replica:0/task:0/device:CPU:0", &remote_enqueue_request); - std::unordered_map attrs; + std::unordered_map attrs; val.Clear(); val.set_type(tensorflow::DataType::DT_FLOAT); attrs.insert({"T", val}); @@ -529,12 +531,12 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest { // Creates a context and attempts to execute a function. 
void TestFunction(const RegisterFunctionOp& register_op, - const string& function_name, + const std::string& function_name, const bool local_inputs = false, const bool test_cancel = false) { TestEagerServiceImpl eager_service_impl(&worker_env_); - uint64 context_id = random::New64(); + uint64_t context_id = random::New64(); CreateContextRequest request; request.mutable_server_def()->set_job_name("localhost"); @@ -561,12 +563,12 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest { SetTensorProto(&tensor_proto); AddOperationToEnqueueRequest( 2, function_name, {tensor_proto}, - std::unordered_map(), + std::unordered_map(), "/job:localhost/replica:0/task:0/device:CPU:0", &remote_enqueue_request); } else { - std::unordered_map const_attrs; + std::unordered_map const_attrs; AttrValue val; val.set_type(tensorflow::DataType::DT_FLOAT); const_attrs.insert({"dtype", val}); @@ -581,7 +583,7 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest { &remote_enqueue_request); AddOperationToEnqueueRequest( 2, function_name, {std::make_pair(1, 0)}, - std::unordered_map(), + std::unordered_map(), "/job:localhost/replica:0/task:0/device:CPU:0", &remote_enqueue_request); } @@ -629,10 +631,10 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest { // Creates a context and attempts to execute a component function. void TestComponentFunction(const RegisterFunctionOp& register_op, - const string& function_name, + const std::string& function_name, const bool test_cancel) { TestEagerServiceImpl eager_service_impl(&worker_env_); - uint64 context_id = random::New64(); + uint64_t context_id = random::New64(); // Create context. CreateContextRequest request; @@ -655,7 +657,7 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest { remote_enqueue_request.set_context_id(context_id); EnqueueResponse remote_enqueue_response; - std::unordered_map const_attrs; + std::unordered_map const_attrs; AttrValue val; val.set_type(tensorflow::DataType::DT_FLOAT); const_attrs.insert({"dtype", val}); @@ -675,7 +677,7 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest { const int output_num = 5; AddOperationToRunComponentFunctionRequest( 2, function_name, {std::make_pair(1, 0)}, - std::unordered_map(), + std::unordered_map(), "/job:localhost/replica:0/task:0/device:CPU:0", output_num, &run_comp_func_request); @@ -772,7 +774,7 @@ TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionTest) { TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionWithNameClashTest) { TestEagerServiceImpl eager_service_impl(&worker_env_); - uint64 context_id = random::New64(); + uint64_t context_id = random::New64(); // Create context. 
CreateContextRequest request; @@ -820,7 +822,7 @@ TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionWithNameClashTest) { remote_enqueue_request.set_context_id(context_id); EnqueueResponse remote_enqueue_response; - std::unordered_map const_attrs; + std::unordered_map const_attrs; AttrValue val; val.set_type(tensorflow::DataType::DT_FLOAT); const_attrs.insert({"dtype", val}); @@ -841,7 +843,7 @@ TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionWithNameClashTest) { const int output_num = 5; AddOperationToRunComponentFunctionRequest( 2, "MatMulNestedFunction", {std::make_pair(1, 0)}, - std::unordered_map(), + std::unordered_map(), "/job:localhost/replica:0/task:0/device:CPU:0", output_num, &run_comp_func_request); @@ -883,7 +885,7 @@ TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionWithNameClashTest) { const int output_num = 5; AddOperationToRunComponentFunctionRequest( 3, "MatMulNestedTransposeFunction", {std::make_pair(1, 0)}, - std::unordered_map(), + std::unordered_map(), "/job:localhost/replica:0/task:0/device:CPU:0", output_num, &run_comp_func_request); @@ -984,7 +986,7 @@ class FunctionWithRemoteInputsTest : public EagerServiceImplTest { EnqueueRequest remote_enqueue_request; remote_enqueue_request.set_context_id(context_id_); EnqueueResponse remote_enqueue_response; - std::unordered_map const_attrs; + std::unordered_map const_attrs; AttrValue val; val.set_type(tensorflow::DataType::DT_FLOAT); const_attrs.insert({"dtype", val}); @@ -1045,11 +1047,13 @@ class FunctionWithRemoteInputsTest : public EagerServiceImplTest { } protected: - const string local_device_ = "/job:localhost/replica:0/task:0/device:CPU:0"; - const string remote_device_ = "/job:localhost/replica:0/task:1/device:CPU:0"; + const std::string local_device_ = + "/job:localhost/replica:0/task:0/device:CPU:0"; + const std::string remote_device_ = + "/job:localhost/replica:0/task:1/device:CPU:0"; TestEagerServiceImpl eager_service_impl_; std::unique_ptr remote_device_mgr_; - uint64 context_id_; + uint64_t context_id_; tensorflow::FunctionDef fdef_; std::unique_ptr eager_pflr_; std::unique_ptr eager_cluster_flr_; @@ -1072,7 +1076,7 @@ TEST_F(FunctionWithRemoteInputsTest, EagerPFLRTest) { fdef_.signature().name(), AttrSlice(&fdef_.attr()), options, &handle)); EagerContext* ctx = nullptr; TF_ASSERT_OK(eager_service_impl_.GetEagerContext(context_id_, &ctx)); - for (const string& func_name : ctx->FuncLibDef()->ListFunctionNames()) { + for (const std::string& func_name : ctx->FuncLibDef()->ListFunctionNames()) { const FunctionDef* fdef = ctx->FuncLibDef()->Find(func_name); EXPECT_TRUE(fdef != nullptr); if (absl::StartsWith(func_name, "MatMulFunction")) { @@ -1085,7 +1089,7 @@ TEST_F(FunctionWithRemoteInputsTest, EagerPFLRTest) { // Run MatMulFunction on remote_device. 
FunctionLibraryRuntime::Options opts; - const uint64 op_id = 2; + const uint64_t op_id = 2; opts.op_id = op_id; absl::Notification done; absl::Status status; @@ -1133,7 +1137,7 @@ TEST_F(FunctionWithRemoteInputsTest, TF_ASSERT_OK(status); EagerContext* ctx = nullptr; TF_ASSERT_OK(eager_service_impl_.GetEagerContext(context_id_, &ctx)); - for (const string& func_name : ctx->FuncLibDef()->ListFunctionNames()) { + for (const std::string& func_name : ctx->FuncLibDef()->ListFunctionNames()) { const FunctionDef* fdef = ctx->FuncLibDef()->Find(func_name); EXPECT_TRUE(fdef != nullptr); if (absl::StartsWith(func_name, "MatMulFunction")) { @@ -1288,7 +1292,7 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncAsyncTest) { TEST_F(EagerServiceImplTest, SendTensorTest) { TestEagerServiceImpl eager_service_impl(&worker_env_); - uint64 context_id = random::New64(); + uint64_t context_id = random::New64(); CreateContextRequest request; request.mutable_server_def()->set_job_name("localhost"); @@ -1306,7 +1310,7 @@ TEST_F(EagerServiceImplTest, SendTensorTest) { send_tensor->set_op_id(1); SetTensorProto(send_tensor->add_tensors()); - std::unordered_map attrs; + std::unordered_map attrs; AttrValue val; val.Clear(); val.set_type(tensorflow::DataType::DT_FLOAT); @@ -1351,13 +1355,13 @@ TEST_F(EagerServiceImplTest, SendTensorTest) { TEST_F(EagerServiceImplTest, SendPackedHandleTest) { TestEagerServiceImpl eager_service_impl(&worker_env_); - const string device0 = "/job:localhost/replica:0/task:0/device:CPU:0"; - const string device1 = "/job:localhost/replica:0/task:1/device:CPU:0"; - const string device2 = "/job:localhost/replica:0/task:2/device:CPU:0"; - const string composite_device = + const std::string device0 = "/job:localhost/replica:0/task:0/device:CPU:0"; + const std::string device1 = "/job:localhost/replica:0/task:1/device:CPU:0"; + const std::string device2 = "/job:localhost/replica:0/task:2/device:CPU:0"; + const std::string composite_device = "/job:localhost/replica:0/task:0/device:COMPOSITE:0"; - uint64 context_id = random::New64(); + uint64_t context_id = random::New64(); CreateContextRequest request; auto* server_def = request.mutable_server_def(); server_def->set_job_name("localhost"); @@ -1465,7 +1469,7 @@ TEST_F(EagerServiceImplTest, RequestsToMasterTest) { /*async=*/false, device_mgr_.get(), false, std::move(rendezvous), nullptr, nullptr, /*run_eager_op_as_function=*/true); - const uint64 context_id = random::New64(); + const uint64_t context_id = random::New64(); // Set RemoteMgr to ctx. auto remote_mgr = @@ -1506,7 +1510,7 @@ TEST_F(EagerServiceImplTest, RequestsToMasterTest) { TEST_F(EagerServiceImplTest, KeepAliveTest) { TestEagerServiceImpl eager_service_impl(&worker_env_); - uint64 context_id = random::New64(); + uint64_t context_id = random::New64(); CreateContextRequest request; request.mutable_server_def()->set_job_name("localhost"); request.mutable_server_def()->set_task_index(0); @@ -1531,7 +1535,7 @@ TEST_F(EagerServiceImplTest, KeepAliveTest) { EXPECT_PRED_FORMAT2(::testing::IsSubstring, "Unable to find a context_id", std::string(status.message())); - uint64 new_context_id = random::New64(); + uint64_t new_context_id = random::New64(); // Create a new context. 
request.set_context_id(new_context_id); TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response)); diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc index 8066664cd0e456..e532bdff5e657a 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc @@ -66,8 +66,8 @@ absl::Status CreateUncachedKernelAndDeviceOp( // This gets a unique wire ID. We add a random identifier so that if the // worker has other clients that it is servicing, we don't have any collision. -string GetUniqueWireID() { - static tensorflow::uint64 random_seed = random::New64(); +std::string GetUniqueWireID() { + static uint64_t random_seed = random::New64(); static tensorflow::mutex wireid_mutex(tensorflow::LINKER_INITIALIZED); static std::atomic wire_id; return absl::StrCat(random_seed, "_", wire_id++); @@ -77,7 +77,7 @@ string GetUniqueWireID() { RemoteCopyNode::RemoteCopyNode(EagerContext* ctx, EagerExecutor* executor, TensorHandle* src, TensorHandle* dst, - Device* recv_device, uint64 recv_op_id) + Device* recv_device, uint64_t recv_op_id) : AsyncEagerNode(), src_(src), ctx_(ctx), @@ -220,12 +220,12 @@ absl::Status RemoteCopyNode::RunLocalRecv(EagerOperation* op, void RemoteCopyNode::RunRemoteRecv(EagerOperation* op, StatusCallback done) { EnqueueRequest request; - uint64 context_id = ctx_->GetContextId(); + uint64_t context_id = ctx_->GetContextId(); request.set_context_id(context_id); auto* remote_op = request.add_queue()->mutable_operation(); PrepareRemoteOp(remote_op, op); remote_op->set_id(recv_op_id_); - uint64 context_view_id = ctx_->GetContextViewId(); + uint64_t context_view_id = ctx_->GetContextViewId(); core::RefCountPtr eager_client; absl::Status status = ctx_->GetClient(recv_device_, &eager_client); @@ -316,7 +316,7 @@ void RemoteCopyNode::StartRecv(StatusCallback done) { } } -absl::Status SerializePackedHandle(const uint64 op_id, +absl::Status SerializePackedHandle(const uint64_t op_id, TensorHandle* packed_handle, const Device* target_device, EagerContext* ctx, SendPackedHandleOp* op) { @@ -362,7 +362,7 @@ absl::Status SerializePackedHandle(const uint64 op_id, void RemoteCopyNode::StartSendPackedHandle(StatusCallback done) { absl::Status s; - const uint64 context_view_id = ctx_->GetContextViewId(); + const uint64_t context_view_id = ctx_->GetContextViewId(); if (!send_device_->IsLocal()) { s = errors::InvalidArgument( "Copy a packed handle from a remote device is not supported"); @@ -372,7 +372,7 @@ void RemoteCopyNode::StartSendPackedHandle(StatusCallback done) { } EnqueueRequest request; - uint64 context_id = ctx_->GetContextId(); + uint64_t context_id = ctx_->GetContextId(); request.set_context_id(context_id); s = SerializePackedHandle(recv_op_id_, src_, recv_device_, ctx_, request.add_queue()->mutable_send_packed_handle()); @@ -426,12 +426,12 @@ void RemoteCopyNode::StartSendPackedHandle(StatusCallback done) { void RemoteCopyNode::StartRemoteSendTensor(StatusCallback done) { absl::Status s; EnqueueRequest request; - uint64 context_id = ctx_->GetContextId(); + uint64_t context_id = ctx_->GetContextId(); request.set_context_id(context_id); auto* send_tensor = request.add_queue()->mutable_send_tensor(); send_tensor->set_op_id(recv_op_id_); send_tensor->set_device_name(recv_device_->name()); - uint64 context_view_id = ctx_->GetContextViewId(); + uint64_t context_view_id = ctx_->GetContextViewId(); // AsProtoTensorContent 
doesn't work when the tensor is on the GPU, hence // copy it to the CPU before copying it out. @@ -515,7 +515,7 @@ void RemoteCopyNode::RunAsync(StatusCallback done) { void RemoteCopyNode::Abort(absl::Status status) { if (!started_) { - uint64 context_view_id = ctx_->GetContextViewId(); + uint64_t context_view_id = ctx_->GetContextViewId(); captured_state_->dst()->PoisonRemote(status, recv_device_, context_view_id); } } diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.h b/tensorflow/core/distributed_runtime/eager/remote_copy_node.h index 572b650651b0c3..a8dc387d9a7dbf 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.h +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.h @@ -63,7 +63,7 @@ namespace eager { class RemoteCopyNode : public AsyncEagerNode { public: RemoteCopyNode(EagerContext* ctx, EagerExecutor* executor, TensorHandle* src, - TensorHandle* dst, Device* recv_device, uint64 recv_op_id); + TensorHandle* dst, Device* recv_device, uint64_t recv_op_id); ~RemoteCopyNode() override; @@ -73,8 +73,8 @@ class RemoteCopyNode : public AsyncEagerNode { void Abort(absl::Status status) override; - string DebugString() const override { - string out = "[RemoteCopyNode]"; + std::string DebugString() const override { + std::string out = "[RemoteCopyNode]"; absl::StrAppend(&out, " send_device: ", send_device_->name()); absl::StrAppend(&out, ", recv_device: ", recv_device_->name()); absl::StrAppend(&out, ", send_tensor: ", src_->DebugString()); @@ -167,8 +167,8 @@ class RemoteCopyNode : public AsyncEagerNode { EagerExecutor* const executor_; Device* const send_device_; Device* const recv_device_; - const string wire_id_; - const uint64 recv_op_id_; + const std::string wire_id_; + const uint64_t recv_op_id_; std::shared_ptr captured_state_; bool started_; diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc b/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc index f118ecaeb2bbad..3c526f2904d34c 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc @@ -32,9 +32,9 @@ void RemoteExecuteNode::RunAsync(StatusCallback done) { Device* device = device_; // Filled and used only when VLOG(3) is on. - string rpc_description; + std::string rpc_description; if (VLOG_IS_ON(3)) { - std::vector ops; + std::vector ops; ops.reserve(request_->queue_size()); for (const QueueItem& item : request_->queue()) { if (item.has_operation()) { @@ -96,7 +96,7 @@ void RemoteExecuteNode::RunAsync(StatusCallback done) { } for (size_t i = 0; i < retvals.size(); ++i) { if (status.ok()) { - const string output_device = + const std::string output_device = response->queue_response(0).device().empty() ? 
"" : response->queue_response(0).device(i); diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h index e29d8d1c187f31..8cc9501efb06d4 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h +++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h @@ -40,7 +40,7 @@ class RemoteExecuteNode : public AsyncRemoteExecuteNode { public: RemoteExecuteNode(EagerContext* eager_context, std::unique_ptr request, Device* device, - uint64 context_view_id, EagerClient* eager_client, + uint64_t context_view_id, EagerClient* eager_client, CancellationManager* cancellation_manager, const NodeDef& ndef, const FunctionLibraryDefinition* lib_def, @@ -118,8 +118,8 @@ class RemoteExecuteNode : public AsyncRemoteExecuteNode { return eager_client_->allow_multiple_pending_requests(); } - string DebugString() const override { - string out = "[RemoteExecuteNode]"; + std::string DebugString() const override { + std::string out = "[RemoteExecuteNode]"; absl::StrAppend(&out, " request: ", request_->DebugString()); absl::StrAppend(&out, ", target_device: ", device_->name()); return out; @@ -129,7 +129,7 @@ class RemoteExecuteNode : public AsyncRemoteExecuteNode { EagerContext* eager_context_; // Not owned, and must outlive this node. std::unique_ptr request_; Device* device_; // Not owned - uint64 context_view_id_; + uint64_t context_view_id_; bool needs_remote_inputs_; EagerClient* eager_client_; // Not owned, and must outlive this node. CancellationManager* cancellation_manager_; diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc index acd34fd9ccbc86..5cec8424c2e14d 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc @@ -127,7 +127,7 @@ absl::Status RemoteMgr::GetMirroredResourceShape( absl::Status RemoteMgr::GetRemoteTensorHandle( const tensorflow::TensorHandle* handle, const bool wait_until_ready, - int64_t* op_id, int32* output_num) { + int64_t* op_id, int32_t* output_num) { TF_RETURN_IF_ERROR(handle->RemoteAddress(handle->device(), wait_until_ready, op_id, output_num)); tensorflow::TensorHandle* h; @@ -213,7 +213,7 @@ absl::Status RemoteMgr::DeserializeRemoteTensorHandle( } else { // Create a remote TensorHandle for remote tensors which have not been // copied to the local worker yet (e.g. remote function inputs). - const string& device_name = + const std::string& device_name = in.op_device().empty() ? 
in.device() : in.op_device(); TF_RETURN_IF_ERROR( parent_->FindDeviceFromName(device_name.c_str(), &device)); @@ -241,7 +241,7 @@ absl::Status RemoteMgr::DeserializeRemoteTensorHandle( return absl::OkStatus(); } -EagerExecutor& RemoteMgr::GetOrCreateExecutorForStream(uint64 stream_id) { +EagerExecutor& RemoteMgr::GetOrCreateExecutorForStream(uint64_t stream_id) { mutex_lock l(executor_map_mu_); auto it = executor_map_.find(stream_id); if (it == executor_map_.end()) { @@ -254,7 +254,7 @@ EagerExecutor& RemoteMgr::GetOrCreateExecutorForStream(uint64 stream_id) { return it->second; } -void RemoteMgr::DeleteExecutorForStream(uint64 stream_id) { +void RemoteMgr::DeleteExecutorForStream(uint64_t stream_id) { mutex_lock l(executor_map_mu_); auto it = executor_map_.find(stream_id); if (it == executor_map_.end()) { diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.h b/tensorflow/core/distributed_runtime/eager/remote_mgr.h index b62134cd6e5860..975cfa13e45ef7 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr.h +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.h @@ -58,7 +58,7 @@ class RemoteMgr { // Helper function to create monotonically increasing ids unique to this // context. - uint64 NextOpId() { + uint64_t NextOpId() { DCHECK(is_master_); mutex_lock l(next_id_mutex_); return next_op_id_++; @@ -77,20 +77,20 @@ class RemoteMgr { absl::Status DeserializeRemoteTensorHandle(const RemoteTensorHandle& in, TensorHandle** out); - EagerExecutor& GetOrCreateExecutorForStream(uint64 stream_id); + EagerExecutor& GetOrCreateExecutorForStream(uint64_t stream_id); - void DeleteExecutorForStream(uint64 stream_id); + void DeleteExecutorForStream(uint64_t stream_id); protected: mutex next_id_mutex_; - uint64 next_op_id_ TF_GUARDED_BY(next_id_mutex_) = 1; + uint64_t next_op_id_ TF_GUARDED_BY(next_id_mutex_) = 1; private: // Returns the op_id and output_num if the given local TensorHandle exists in // remote_tensor_handle_map_. absl::Status GetRemoteTensorHandle(const tensorflow::TensorHandle* handle, const bool wait_until_ready, - int64_t* op_id, int32* output_num) + int64_t* op_id, int32_t* output_num) TF_SHARED_LOCKS_REQUIRED(remote_tensor_handle_mu_); absl::Status GetTensorHandleImpl( @@ -129,7 +129,7 @@ class RemoteMgr { EagerContext* parent_; // not owned. 
mutex executor_map_mu_; - std::unordered_map executor_map_ + std::unordered_map executor_map_ TF_GUARDED_BY(executor_map_mu_); }; diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc index ae05ce640cf0dc..89901367b49b2d 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc @@ -35,7 +35,7 @@ class TestRemoteMgr : public RemoteMgr { TestRemoteMgr(bool is_master, EagerContext* ctx) : RemoteMgr(is_master, ctx) {} - uint64 OpId() { + uint64_t OpId() { tf_shared_lock l(next_id_mutex_); return next_op_id_; } @@ -75,7 +75,7 @@ TEST_F(RemoteMgrTest, SerializeLocalTensorHandleWithRemoteMirror) { TensorHandle* handle = TensorHandle::CreateLocalHandle( std::move(t), local_device_, local_device_, ctx_); - const uint64 op_id = 2; + const uint64_t op_id = 2; const int output_num = 3; TF_ASSERT_OK(handle->AddUnshapedRemoteMirror(remote_device_, op_id, output_num, "", ctx_)); @@ -94,7 +94,7 @@ TEST_F(RemoteMgrTest, SerializeLocalTensorHandleWithRemoteMirror) { TEST_F(RemoteMgrTest, SerializeRemoteTensorHandle) { RemoteMgr remote_mgr(false, ctx_); - const uint64 op_id = 3; + const uint64_t op_id = 3; const int output_num = 1; TensorHandle* handle = TensorHandle::CreateLazyRemoteHandle( op_id, output_num, DT_FLOAT, remote_device_, /*is_ready=*/true, ctx_); @@ -113,7 +113,7 @@ TEST_F(RemoteMgrTest, InvalidateRemoteMirrorWithClusterUpdate) { TensorHandle* handle = TensorHandle::CreateLocalHandle( std::move(t), local_device_, local_device_, ctx_); - const uint64 op_id = 2; + const uint64_t op_id = 2; const int output_num = 3; TF_ASSERT_OK(handle->AddUnshapedRemoteMirror(remote_device_, op_id, output_num, "", ctx_)); @@ -134,7 +134,7 @@ TEST_F(RemoteMgrTest, InvalidateRemoteMirrorWithClusterUpdate) { TEST_F(RemoteMgrTest, SetRemoteShapeWithClusterUpdate) { RemoteMgr remote_mgr(false, ctx_); - const uint64 op_id = 3; + const uint64_t op_id = 3; const int output_num = 1; TensorHandle* handle = TensorHandle::CreateUnshapedRemoteHandle( op_id, output_num, @@ -157,7 +157,7 @@ TEST_F(RemoteMgrTest, SetRemoteShapeWithClusterUpdate) { TEST_F(RemoteMgrTest, ErrorSourcesShouldExist) { RemoteMgr remote_mgr(false, ctx_); - const uint64 op_id = 3; + const uint64_t op_id = 3; const int output_num = 1; TensorHandle* handle = TensorHandle::CreateLazyRemoteHandle( op_id, output_num, DT_FLOAT, remote_device_, /*is_ready=*/true, ctx_); diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h index 903d019172a457..51f8d97e6ce6f8 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h @@ -28,7 +28,7 @@ struct RemoteTensorHandleInternal { RemoteTensorHandleInternal(int64_t op_id, int32_t output_num) : op_id(op_id), output_num(output_num) {} int64_t op_id; - int32 output_num; + int32_t output_num; }; struct RemoteTensorHandleInternalHash { diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc index 73427ed1372ed8..32ec58774d99cb 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc @@ -29,9 +29,10 @@ namespace tensorflow { namespace { -void DestroyRemoteTensorHandle(EagerContext* 
ctx, const string& remote_task, - uint64 context_id, uint64 op_id, int output_num, - bool ready) { +void DestroyRemoteTensorHandle(EagerContext* ctx, + const std::string& remote_task, + uint64_t context_id, uint64_t op_id, + int output_num, bool ready) { if (ctx->GetContextId() != context_id) { // This means that this tensor was pointing to a remote device, which // has been changed out from under us. Simply return since there is @@ -89,7 +90,7 @@ void DestroyRemoteTensorHandle(EagerContext* ctx, const string& remote_task, } // namespace RemoteTensorHandleData::RemoteTensorHandleData(int64_t op_id, int output_num, - uint64 context_view_id, + uint64_t context_view_id, bool is_ready) : is_ready_(is_ready), op_id_(op_id), @@ -102,7 +103,7 @@ RemoteTensorHandleData::RemoteTensorHandleData(int64_t op_id, int output_num, } RemoteTensorHandleData::RemoteTensorHandleData(int64_t op_id, int output_num, - const string& remote_task, + const std::string& remote_task, EagerContext* ctx) : is_ready_(false), op_id_(op_id), @@ -182,7 +183,7 @@ absl::Status RemoteTensorHandleData::SetShape(const TensorShape& shape) { } absl::Status RemoteTensorHandleData::SetShapeAndRemoteTask( - const TensorShape& shape, const string& remote_task) { + const TensorShape& shape, const std::string& remote_task) { // If `is_ready_` is set previously due to poisoning, return the original // error that poisoned this tensor. TF_RETURN_IF_ERROR(IsPoisoned()); @@ -216,13 +217,13 @@ absl::Status RemoteTensorHandleData::SetShapeAndRemoteTask( return absl::OkStatus(); } -string RemoteTensorHandleData::DebugString() const { +std::string RemoteTensorHandleData::DebugString() const { return absl::StrCat("RemoteTensorHandleData:", " op_id: ", op_id_, " output_num: ", output_num_); } absl::Status RemoteTensorHandleData::OpIdAndOutputNum( - const bool wait_until_ready, int64_t* op_id, int32* output_num) const { + const bool wait_until_ready, int64_t* op_id, int32_t* output_num) const { if (wait_until_ready) { TF_RETURN_IF_ERROR(WaitReady("OpIdAndOutputNumUntilReady")); } diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h index 892d82bd5f7efe..1c7099cc66b1a4 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h @@ -31,12 +31,12 @@ class RemoteTensorHandleData { // the corresponding remote tensor is ready. So the remote tensor should be // ready when we create a lazy remote handle. If it refers to a remote output, // it's not ready until the shape is set. - RemoteTensorHandleData(int64_t op_id, int output_num, uint64 context_view_id, - bool is_ready); + RemoteTensorHandleData(int64_t op_id, int output_num, + uint64_t context_view_id, bool is_ready); // Constructor for unshaped remote handles. It controls the lifetime of a // remote handle that it refers to. 
RemoteTensorHandleData(int64_t op_id, int output_num, - const string& remote_task, EagerContext* ctx); + const std::string& remote_task, EagerContext* ctx); ~RemoteTensorHandleData(); // A remote tensor handle does not have a Tensor object, hence it can only @@ -51,18 +51,18 @@ class RemoteTensorHandleData { absl::Status WaitReady(const char* caller) const; absl::Status SetShape(const TensorShape& shape); absl::Status SetShapeAndRemoteTask(const TensorShape& shape, - const string& remote_task); + const std::string& remote_task); void Poison(absl::Status status); absl::Status IsPoisoned() const; - string DebugString() const; + std::string DebugString() const; // Return the op id and output num. If wait_until_ready is true, block until // the remote tensor is ready on a remote worker. absl::Status OpIdAndOutputNum(bool wait_until_ready, int64_t* op_id, - int32* output_num) const; + int32_t* output_num) const; - uint64 context_view_id() const { return context_view_id_; } + uint64_t context_view_id() const { return context_view_id_; } private: mutable mutex mu_; @@ -72,10 +72,10 @@ class RemoteTensorHandleData { // IDs required when this class is representing a remote tensor handle. const int64_t op_id_; - const int32 output_num_; - string remote_task_ TF_GUARDED_BY(mu_); - uint64 context_id_; - uint64 context_view_id_; + const int32_t output_num_; + std::string remote_task_ TF_GUARDED_BY(mu_); + uint64_t context_id_; + uint64_t context_view_id_; EagerContext* ctx_; }; diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc index 13d130d289418c..507915a74152be 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.cc +++ b/tensorflow/core/distributed_runtime/graph_mgr.cc @@ -337,7 +337,7 @@ absl::Status GraphMgr::Register(const std::string& handle, const GraphDef& gdef, { mutex_lock l(mu_); *graph_handle = - strings::Printf("%016llx", static_cast(++next_id_)); + absl::StrFormat("%016llx", static_cast(++next_id_)); item->handle = *graph_handle; CHECK(table_.insert({*graph_handle, item}).second); } diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc index d781cb254fa9a9..66e39b5a15ce61 100644 --- a/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc +++ b/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc @@ -60,7 +60,7 @@ void ConfigCoordinationService(tensorflow::ServerDef* server_def, coord_config->set_enable_health_check(enable_health_check); } -string SetConfigKeyValueFn() { +std::string SetConfigKeyValueFn() { FunctionDef fdef; tensorflow::protobuf::TextFormat::ParseFromString( " signature {" @@ -86,7 +86,7 @@ string SetConfigKeyValueFn() { return fdef.SerializeAsString(); } -string GetConfigKeyValueFn() { +std::string GetConfigKeyValueFn() { FunctionDef fdef; tensorflow::protobuf::TextFormat::ParseFromString( " signature {" @@ -521,7 +521,7 @@ TEST_P(SingleClientCoordinationServiceTest, TestSetGetConfigInOp) { TF_DeleteTensor(t); TFE_DeleteOp(get_op2); - const string& set_fdef = SetConfigKeyValueFn(); + const std::string& set_fdef = SetConfigKeyValueFn(); TFE_ContextAddFunctionDef(ctx, set_fdef.data(), set_fdef.size(), status); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Op* set_fn = TFE_NewOp(ctx, "SetConfigKeyValueFn", status); @@ -542,7 +542,7 @@ TEST_P(SingleClientCoordinationServiceTest, TestSetGetConfigInOp) { 
TFE_DeleteTensorHandle(set_val); TFE_DeleteOp(set_fn); - const string& get_fdef = GetConfigKeyValueFn(); + const std::string& get_fdef = GetConfigKeyValueFn(); TFE_ContextAddFunctionDef(ctx, get_fdef.data(), get_fdef.size(), status); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Op* get_fn = TFE_NewOp(ctx, "GetConfigKeyValueFn", status); diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc index 7d767e9a8ce42a..73db4a0bb22cee 100644 --- a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc +++ b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc @@ -39,7 +39,7 @@ namespace { std::string SendFunction(const std::string& send_device, const std::string& recv_device, - const tensorflow::int64 send_device_incarnation) { + const int64_t send_device_incarnation) { tensorflow::FunctionDef def; CHECK(tensorflow::protobuf::TextFormat::ParseFromString( absl::StrCat(" signature {" @@ -100,7 +100,7 @@ std::string SendFunction(const std::string& send_device, std::string RecvFunction(const std::string& send_device, const std::string& recv_device, - const tensorflow::int64 send_device_incarnation) { + const int64_t send_device_incarnation) { tensorflow::FunctionDef def; CHECK(tensorflow::protobuf::TextFormat::ParseFromString( absl::StrCat(" signature {" @@ -239,7 +239,7 @@ TEST_P(MultiClientSendRecvTest, TestMultiClientSendRecv) { std::vector device_attrs; tensorflow::unwrap(ctx)->ListDevices(&device_attrs); - tensorflow::uint64 send_device_incarnation = 0; + uint64_t send_device_incarnation = 0; for (const auto& device_attr : device_attrs) { if (device_attr.name() == send_device) { send_device_incarnation = device_attr.incarnation(); diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc index a4a1476edaab93..640dbb2a334050 100644 --- a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc +++ b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc @@ -174,7 +174,7 @@ TEST(CAPI, MultiClientSendRecv) { tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); context->ListDevices(&device_attrs); - tensorflow::uint64 send_device_incarnation = 0; + uint64_t send_device_incarnation = 0; for (const auto& device_attr : device_attrs) { if (device_attr.name() == send_device) { send_device_incarnation = device_attr.incarnation(); diff --git a/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc b/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc index 893ee615659298..5c1864ec2bff3d 100644 --- a/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc +++ b/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc @@ -45,12 +45,12 @@ class TestSetConfigKeyValueOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor)); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor->shape()), errors::InvalidArgument("Key must be scalar.")); - const string& config_key = key_tensor->scalar()(); + const std::string& config_key = key_tensor->scalar()(); const Tensor* val_tensor; OP_REQUIRES_OK(ctx, ctx->input("value", &val_tensor)); OP_REQUIRES(ctx, 
TensorShapeUtils::IsScalar(key_tensor->shape()), errors::InvalidArgument("Value must be scalar.")); - const string& config_value = val_tensor->scalar()(); + const std::string& config_value = val_tensor->scalar()(); LOG(INFO) << "TestSetConfigKeyValueOp key=" << config_key << "value=" << config_value; auto* coord_agent = ctx->coordination_service_agent(); @@ -90,7 +90,7 @@ class TestGetConfigKeyValueOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor)); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor->shape()), errors::InvalidArgument("Key must be scalar.")); - const string& config_key = key_tensor->scalar()(); + const std::string& config_key = key_tensor->scalar()(); LOG(INFO) << "TestGetConfigKeyValueOp key=" << config_key; auto* coord_agent = ctx->coordination_service_agent(); @@ -142,7 +142,8 @@ class TestReportErrorToClusterOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("error_message", &error_message_tensor)); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(error_message_tensor->shape()), errors::InvalidArgument("Error message must be scalar.")); - const string& error_message = error_message_tensor->scalar()(); + const std::string& error_message = + error_message_tensor->scalar()(); LOG(INFO) << "TestReportErrorToClusterOp error_code=" << error_code << " error_message=" << error_message; auto* coord_agent = ctx->coordination_service_agent(); diff --git a/tensorflow/core/distributed_runtime/local_master.cc b/tensorflow/core/distributed_runtime/local_master.cc index 54a50da2ace799..af41d4ad1d4b49 100644 --- a/tensorflow/core/distributed_runtime/local_master.cc +++ b/tensorflow/core/distributed_runtime/local_master.cc @@ -223,7 +223,7 @@ struct MasterInfo { : master(master), default_timeout_in_ms(default_timeout_in_ms) {} }; -typedef std::unordered_map LocalMasterRegistry; +typedef std::unordered_map LocalMasterRegistry; LocalMasterRegistry* local_master_registry() { static LocalMasterRegistry* local_master_registry_ = new LocalMasterRegistry; return local_master_registry_; @@ -231,7 +231,7 @@ LocalMasterRegistry* local_master_registry() { } // namespace /* static */ -void LocalMaster::Register(const string& target, Master* master, +void LocalMaster::Register(const std::string& target, Master* master, int64_t default_timeout_in_ms) { mutex_lock l(*get_local_master_registry_lock()); local_master_registry()->insert( @@ -239,7 +239,7 @@ void LocalMaster::Register(const string& target, Master* master, } /* static */ -std::unique_ptr LocalMaster::Lookup(const string& target) { +std::unique_ptr LocalMaster::Lookup(const std::string& target) { std::unique_ptr ret; mutex_lock l(*get_local_master_registry_lock()); auto iter = local_master_registry()->find(target); diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h index e4fc37e4f60f50..b9fe78e8591f17 100644 --- a/tensorflow/core/distributed_runtime/local_master.h +++ b/tensorflow/core/distributed_runtime/local_master.h @@ -89,12 +89,12 @@ class LocalMaster : public MasterInterface { // any LocalMaster objects that may wrap this master. There is no // corresponding deregister method, since clean server shutdown is // not currently implemented for any server type. - static void Register(const string& target, Master* master, + static void Register(const std::string& target, Master* master, int64_t default_timeout_in_ms); // Returns a pointer to the local master associated with the given // `target`, or nullptr if none exists. 
- static std::unique_ptr Lookup(const string& target); + static std::unique_ptr Lookup(const std::string& target); private: Master* master_impl_; // Not owned. diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc index 9a2c553f841faf..bc7fa3c80bb678 100644 --- a/tensorflow/core/distributed_runtime/master.cc +++ b/tensorflow/core/distributed_runtime/master.cc @@ -102,7 +102,7 @@ void Master::GC() { if (shutdown_) { break; } - std::vector handles; + std::vector handles; const int64_t num_micros = static_cast(session_gc_seconds_ * 1000000); for (const auto& entry : sessions_) { @@ -124,7 +124,7 @@ void Master::GC() { } } -MasterSession* Master::FindMasterSession(const string& handle) { +MasterSession* Master::FindMasterSession(const std::string& handle) { MasterSession* session = nullptr; { mutex_lock l(mu_); @@ -139,8 +139,8 @@ MasterSession* Master::FindMasterSession(const string& handle) { class DeviceFinder { public: static absl::Status GetRemoteDevices( - const protobuf::RepeatedPtrField& device_filters, MasterEnv* env, - WorkerCacheInterface* worker_cache, + const protobuf::RepeatedPtrField& device_filters, + MasterEnv* env, WorkerCacheInterface* worker_cache, std::vector>* out_remote) { DeviceFinder finder(device_filters, env, worker_cache); finder.Start(); @@ -150,19 +150,20 @@ class DeviceFinder { } static void GetRemoteWorkers( - const protobuf::RepeatedPtrField& device_filters, MasterEnv* env, - WorkerCacheInterface* worker_cache, std::vector* workers) { + const protobuf::RepeatedPtrField& device_filters, + MasterEnv* env, WorkerCacheInterface* worker_cache, + std::vector* workers) { DeviceFinder finder(device_filters, env, worker_cache); *workers = finder.targets_; } private: explicit DeviceFinder( - const protobuf::RepeatedPtrField& device_filters, MasterEnv* env, - WorkerCacheInterface* worker_cache) + const protobuf::RepeatedPtrField& device_filters, + MasterEnv* env, WorkerCacheInterface* worker_cache) : env_(env), worker_cache_(worker_cache) { CHECK(worker_cache) << "Worker cache was null!"; - auto process_filter = [this](const string& filter) { + auto process_filter = [this](const std::string& filter) { DeviceNameUtils::ParsedName parsed; if (DeviceNameUtils::ParseFullName(filter, &parsed)) { filters_.push_back(parsed); @@ -170,7 +171,7 @@ class DeviceFinder { LOG(FATAL) << "Skipping invalid filter: " << filter; } }; - for (const string& filter : device_filters) { + for (const std::string& filter : device_filters) { process_filter(filter); } // Enumerates all known workers' target. A target name is a @@ -178,19 +179,19 @@ class DeviceFinder { if (filters_.empty()) { // If no filters were specified, we list all known workers in // `worker_cache`. - std::vector workers; + std::vector workers; worker_cache->ListWorkers(&workers); std::swap(workers, targets_); } else { // When applying filters, we must include the local worker, even if it // does not match any of the filters. 
CHECK_GT(env_->local_devices.size(), 0) << "No local devices provided."; - const string& local_device_name = env_->local_devices[0]->name(); + const std::string& local_device_name = env_->local_devices[0]->name(); DeviceNameUtils::ParsedName local_parsed_name; CHECK(DeviceNameUtils::ParseFullName(local_device_name, &local_parsed_name)); bool all_filters_have_job = true; - std::unordered_set filter_job_names({local_parsed_name.job}); + std::unordered_set filter_job_names({local_parsed_name.job}); for (const DeviceNameUtils::ParsedName& filter : filters_) { all_filters_have_job = all_filters_have_job && filter.has_job; if (filter.has_job) { @@ -198,14 +199,14 @@ class DeviceFinder { } } - std::vector workers; + std::vector workers; if (all_filters_have_job) { // If all of the device filters have a job specified, then we only need // to list the workers in the jobs named in the filter, because a worker // in any other job would not match any filter. - for (const string& job_name : filter_job_names) { + for (const std::string& job_name : filter_job_names) { VLOG(2) << "Selectively listing workers in job: " << job_name; - std::vector workers_in_job; + std::vector workers_in_job; worker_cache->ListWorkersInJob(job_name, &workers_in_job); workers.insert(workers.end(), workers_in_job.begin(), workers_in_job.end()); @@ -218,13 +219,13 @@ class DeviceFinder { if (device_filters.empty()) { VLOG(2) << "- "; } else { - for (const string& filter : device_filters) { + for (const std::string& filter : device_filters) { VLOG(2) << "- " << filter; } } worker_cache->ListWorkers(&workers); } - for (const string& name : workers) { + for (const std::string& name : workers) { if (MatchFilters(name) || DeviceNameUtils::IsSameAddressSpace(name, local_device_name)) { targets_.push_back(name); @@ -263,7 +264,7 @@ class DeviceFinder { // Every `kLoggingPeriodMs`, while the DeviceFinder is still waiting // to hear from workers, log a list of the workers who have not // responded. - const int32 kLoggingPeriodMs = 10 * 1000; + const int32_t kLoggingPeriodMs = 10 * 1000; absl::Status Wait() { mutex_lock l(mu_); @@ -287,11 +288,11 @@ class DeviceFinder { // The caller takes the ownership of returned remote devices. void GetRemoteDevices(const std::vector& local, std::vector>* remote) { - std::unordered_set names(local.size()); + std::unordered_set names(local.size()); for (Device* dev : local) names.insert(dev->name()); mutex_lock l(mu_); for (Device* dev : found_) { - const string& name = dev->name(); + const std::string& name = dev->name(); if (names.insert(name).second && MatchFilters(name)) { remote->push_back(std::unique_ptr(dev)); } else { @@ -313,7 +314,7 @@ class DeviceFinder { // List of targets to be contacted by this DeviceFinder. The // respective `bool` in `seen_targets_` indicates whether we have // heard from this target or not. - std::vector targets_; + std::vector targets_; std::vector seen_targets_ TF_GUARDED_BY(mu_); absl::Status status_; @@ -347,7 +348,7 @@ class DeviceFinder { } // Returns true iff 'name' matches one of the filters_. 
- bool MatchFilters(const string& name) { + bool MatchFilters(const std::string& name) { if (filters_.empty()) return true; DeviceNameUtils::ParsedName x; if (DeviceNameUtils::ParseFullName(name, &x)) { @@ -386,7 +387,7 @@ void Master::CreateSession(const CreateSessionRequest* req, if (!cluster_def.job().empty()) { worker_cache_factory_options.cluster_def = cluster_def; // If the target starts with gRPC protocol prefix, remove the prefix - string normalized_string(req->target()); + std::string normalized_string(req->target()); RE2::Replace(&normalized_string, kGrpcPrefixRegex, ""); // Set the server_def's job_name and task_index fields. @@ -472,7 +473,7 @@ void Master::CreateSession(const CreateSessionRequest* req, options.config.mutable_experimental() ->set_disable_optimize_for_static_graph(true); - std::vector filtered_worker_list; + std::vector filtered_worker_list; DeviceFinder::GetRemoteWorkers(req->config().device_filters(), env_, worker_cache, &filtered_worker_list); @@ -555,7 +556,7 @@ void Master::RunStep(CallOptions* opts, const RunStepRequestWrapper* req, SchedClosure([this, start_time, session, opts, req, resp, done]() { absl::Status status = session->Run(opts, *req, resp); session->Unref(); - uint64 done_time = env_->env->NowMicros(); + uint64_t done_time = env_->env->NowMicros(); done(status); mutex_lock l(mu_); last_1000_steps_.AddValue((done_time - start_time) / 1e9); @@ -624,7 +625,7 @@ void Master::ListDevices(const ListDevicesRequest* req, } void Master::CleanupWorkers(const ResetRequest& reset) { - std::vector worker_names; + std::vector worker_names; DeviceFinder::GetRemoteWorkers(reset.device_filters(), env_, env_->worker_cache, &worker_names); if (!worker_names.empty()) { @@ -635,7 +636,7 @@ void Master::CleanupWorkers(const ResetRequest& reset) { std::vector resp(num_workers); int c = 0; for (int i = 0; i < num_workers; ++i) { - const string& worker_name = worker_names[i]; + const std::string& worker_name = worker_names[i]; auto worker = env_->worker_cache->GetOrCreateWorker(worker_name); if (worker) { worker->CleanupAllAsync( diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h index a3930249b629ee..f39fd34d0a5900 100644 --- a/tensorflow/core/distributed_runtime/master.h +++ b/tensorflow/core/distributed_runtime/master.h @@ -84,7 +84,7 @@ class Master { Thread* gc_thread_; // Maps session handles to sessions. - std::unordered_map sessions_ TF_GUARDED_BY(mu_); + std::unordered_map sessions_ TF_GUARDED_BY(mu_); // Moving average of step times. MovingAverage last_1000_steps_ TF_GUARDED_BY(mu_); @@ -107,7 +107,7 @@ class Master { // Find master session by session handle, and increments the reference count // on the returned MasterSession if not null. - MasterSession* FindMasterSession(const string& handle); + MasterSession* FindMasterSession(const std::string& handle); Master(const Master&) = delete; void operator=(const Master&) = delete; diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h index b8dcf1963df50d..5845a96836f913 100644 --- a/tensorflow/core/distributed_runtime/master_env.h +++ b/tensorflow/core/distributed_runtime/master_env.h @@ -41,7 +41,7 @@ class OpRegistryInterface; // Options passed to the worker_cache_factory function. 
struct WorkerCacheFactoryOptions { ClusterDef cluster_def; - string job_name; + std::string job_name; int task_index; int replica_index = 0; RPCOptions rpc_options; @@ -96,7 +96,7 @@ struct MasterEnv { std::unique_ptr>>, std::unique_ptr, std::unique_ptr device_set, - std::vector filtered_worker_list)> + std::vector filtered_worker_list)> master_session_factory; std::function client_graph, const SessionOptions& session_opts, const StatsPublisherFactory& stats_publisher_factory, @@ -122,7 +122,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted { int64_t collective_graph_key() { return collective_graph_key_; } - std::unique_ptr GetProfileHandler(uint64 step, + std::unique_ptr GetProfileHandler(uint64_t step, int64_t execution_count, const RunOptions& ropts) { return stats_publisher_->GetProfileHandler(step, execution_count, ropts); @@ -239,7 +239,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted { GraphExecutionState* execution_state); private: - const string session_handle_; + const std::string session_handle_; const BuildGraphOptions bg_opts_; // NOTE(mrry): This pointer will be null after `RegisterPartitions()` returns. @@ -250,13 +250,13 @@ class MasterSession::ReffedClientGraph : public core::RefCounted { WorkerCacheInterface* const worker_cache_; // Not owned. struct NodeDetails { - explicit NodeDetails(string type_string, string detail_text) + explicit NodeDetails(std::string type_string, std::string detail_text) : type_string(std::move(type_string)), detail_text(std::move(detail_text)) {} - const string type_string; - const string detail_text; + const std::string type_string; + const std::string detail_text; }; - std::unordered_map name_to_node_details_; + std::unordered_map name_to_node_details_; const bool should_deregister_; const int64_t collective_graph_key_; @@ -265,20 +265,20 @@ class MasterSession::ReffedClientGraph : public core::RefCounted { // Graph partitioned into per-location subgraphs. struct Part { // Worker name. - string name; + std::string name; // Maps feed names to rendezvous keys. Empty most of the time. - std::unordered_map feed_key; + std::unordered_map feed_key; // Maps rendezvous keys to fetch names. Empty most of the time. - std::unordered_map key_fetch; + std::unordered_map key_fetch; // The interface to the worker. Owned. WorkerInterface* worker = nullptr; // After registration with the worker, graph_handle identifies // this partition on the worker. - string graph_handle; + std::string graph_handle; Part() : feed_key(3), key_fetch(3) {} }; @@ -300,14 +300,15 @@ class MasterSession::ReffedClientGraph : public core::RefCounted { std::unique_ptr stats_publisher_; - string DetailText(const NodeDetails& details, const NodeExecStats& stats) { + std::string DetailText(const NodeDetails& details, + const NodeExecStats& stats) { int64_t tot = 0; for (auto& no : stats.output()) { tot += no.tensor_description().allocation_description().requested_bytes(); } - string bytes; + std::string bytes; if (tot >= 0.1 * 1048576.0) { - bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0); + bytes = absl::StrFormat("[%.1fMB] ", tot / 1048576.0); } return strings::StrCat(bytes, stats.node_name(), " = ", details.type_string, details.detail_text); @@ -322,10 +323,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted { // The actual graph partitioning and registration implementation. 
absl::Status DoBuildPartitions( PartitionOptions popts, ClientGraph* client_graph, - std::unordered_map* out_partitions); + std::unordered_map* out_partitions); absl::Status DoRegisterPartitions( const PartitionOptions& popts, - std::unordered_map graph_partitions); + std::unordered_map graph_partitions); // Prepares a number of calls to workers. One call per partition. // This is a generic method that handles Run, PartialRun, and RunCallable. @@ -359,7 +360,7 @@ absl::Status MasterSession::ReffedClientGraph::RegisterPartitions( std::unique_ptr client_graph; std::swap(client_graph_before_register_, client_graph); mu_.unlock(); - std::unordered_map graph_defs; + std::unordered_map graph_defs; popts.flib_def = client_graph->flib_def.get(); absl::Status s = DoBuildPartitions(popts, client_graph.get(), &graph_defs); @@ -390,9 +391,9 @@ absl::Status MasterSession::ReffedClientGraph::RegisterPartitions( } } -static string SplitByWorker(const Node* node) { - string task; - string device; +static std::string SplitByWorker(const Node* node) { + std::string task; + std::string device; CHECK(DeviceNameUtils::SplitDeviceName(node->assigned_device_name(), &task, &device)) << "node: " << node->name() << " dev: " << node->assigned_device_name(); @@ -413,17 +414,17 @@ void MasterSession::ReffedClientGraph::TrackFeedsAndFetches( bool client_terminated; TF_CHECK_OK(GetNodeAttr(ndef, "client_terminated", &client_terminated)); if (client_terminated) { - string name; + std::string name; TF_CHECK_OK(GetNodeAttr(ndef, "tensor_name", &name)); - string send_device; + std::string send_device; TF_CHECK_OK(GetNodeAttr(ndef, "send_device", &send_device)); - string recv_device; + std::string recv_device; TF_CHECK_OK(GetNodeAttr(ndef, "recv_device", &recv_device)); - uint64 send_device_incarnation; + uint64_t send_device_incarnation; TF_CHECK_OK( GetNodeAttr(ndef, "send_device_incarnation", reinterpret_cast(&send_device_incarnation))); - const string& key = + const std::string& key = Rendezvous::CreateKey(send_device, send_device_incarnation, recv_device, name, FrameAndIter(0, 0)); @@ -439,7 +440,7 @@ void MasterSession::ReffedClientGraph::TrackFeedsAndFetches( absl::Status MasterSession::ReffedClientGraph::DoBuildPartitions( PartitionOptions popts, ClientGraph* client_graph, - std::unordered_map* out_partitions) { + std::unordered_map* out_partitions) { if (popts.need_to_record_start_times) { CostModel cost_model(true); cost_model.InitFromGraph(client_graph->graph); @@ -455,7 +456,7 @@ absl::Status MasterSession::ReffedClientGraph::DoBuildPartitions( absl::Status MasterSession::ReffedClientGraph::DoRegisterPartitions( const PartitionOptions& popts, - std::unordered_map graph_partitions) { + std::unordered_map graph_partitions) { partitions_.reserve(graph_partitions.size()); absl::Status s; for (auto& name_def : graph_partitions) { @@ -524,7 +525,7 @@ class RunManyGraphs { // Returns the index-th call. 
struct Call { CallOptions opts; - const string* worker_name; + const std::string* worker_name; std::atomic done{false}; std::unique_ptr req; std::unique_ptr resp; @@ -625,13 +626,15 @@ class RunManyGraphs { absl::Status AddSendFromClientRequest(const RunStepRequestWrapper& client_req, MutableRunGraphRequestWrapper* worker_req, - size_t index, const string& send_key) { + size_t index, + const std::string& send_key) { return worker_req->AddSendFromRunStepRequest(client_req, index, send_key); } absl::Status AddSendFromClientRequest(const RunCallableRequest& client_req, MutableRunGraphRequestWrapper* worker_req, - size_t index, const string& send_key) { + size_t index, + const std::string& send_key) { return worker_req->AddSendFromRunCallableRequest(client_req, index, send_key); } @@ -639,13 +642,13 @@ absl::Status AddSendFromClientRequest(const RunCallableRequest& client_req, // in-process messages. struct RunCallableResponseWrapper { RunCallableResponse* resp; // Not owned. - std::unordered_map fetch_key_to_protos; + std::unordered_map fetch_key_to_protos; RunMetadata* mutable_metadata() { return resp->mutable_metadata(); } absl::Status AddTensorFromRunGraphResponse( - const string& tensor_name, MutableRunGraphResponseWrapper* worker_resp, - size_t index) { + const std::string& tensor_name, + MutableRunGraphResponseWrapper* worker_resp, size_t index) { return worker_resp->RecvValue(index, &fetch_key_to_protos[tensor_name]); } }; @@ -709,18 +712,18 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitionsHelper( // inadvertently slowing down the normal run path. if (is_partial_) { for (const auto& name_index : feeds) { - const auto iter = part.feed_key.find(string(name_index.first)); + const auto iter = part.feed_key.find(std::string(name_index.first)); if (iter == part.feed_key.end()) { // The provided feed must be for a different partition. continue; } - const string& key = iter->second; + const std::string& key = iter->second; TF_RETURN_IF_ERROR(AddSendFromClientRequest(req, c->req.get(), name_index.second, key)); } // TODO(suharshs): Make a map from feed to fetch_key to make this faster. // For now, we just iterate through partitions to find the matching key. 
- for (const string& req_fetch : fetches) { + for (const std::string& req_fetch : fetches) { for (const auto& key_fetch : part.key_fetch) { if (key_fetch.second == req_fetch) { c->req->add_recv_key(key_fetch.first); @@ -730,8 +733,8 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitionsHelper( } } else { for (const auto& feed_key : part.feed_key) { - const string& feed = feed_key.first; - const string& key = feed_key.second; + const std::string& feed = feed_key.first; + const std::string& key = feed_key.second; auto iter = feeds.find(feed); if (iter == feeds.end()) { return errors::Internal("No feed index found for feed: ", feed); @@ -741,7 +744,7 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitionsHelper( AddSendFromClientRequest(req, c->req.get(), feed_index, key)); } for (const auto& key_fetch : part.key_fetch) { - const string& key = key_fetch.first; + const std::string& key = key_fetch.first; c->req->add_recv_key(key); } } @@ -790,7 +793,7 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitionsHelper( run_graph_resp->recv_key(j))); break; } - const string& fetch = iter->second; + const std::string& fetch = iter->second; status.Update( resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j)); if (!status.ok()) { @@ -834,7 +837,7 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitions( } } - std::vector fetches; + std::vector fetches; fetches.reserve(req.num_fetches()); for (size_t i = 0; i < req.num_fetches(); ++i) { fetches.push_back(req.fetch_name(i)); @@ -870,7 +873,7 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitions( call_opts, req, &wrapped_resp, cm, false /* is_last_partial_run */)); // Collects fetches. - for (const string& fetch : callable_opts_.fetch()) { + for (const std::string& fetch : callable_opts_.fetch()) { TensorProto* fetch_proto = resp->mutable_fetch()->Add(); auto iter = wrapped_resp.fetch_key_to_protos.find(fetch); if (iter == wrapped_resp.fetch_key_to_protos.end()) { @@ -1001,7 +1004,7 @@ void MasterSession::ReffedClientGraph::ProcessStats(int64_t step_id, void MasterSession::ReffedClientGraph::ProcessDeviceStats( ProfileHandler* ph, const DeviceStepStats& ds, bool is_rpc) { - const string& dev_name = ds.device(); + const std::string& dev_name = ds.device(); VLOG(1) << "Device " << dev_name << " reports stats for " << ds.node_stats_size() << " nodes"; for (const auto& ns : ds.node_stats()) { @@ -1026,9 +1029,9 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats( } continue; } - const string& optype = + const std::string& optype = found_node_in_graph ? iter->second.type_string : ns.node_name(); - string details; + std::string details; if (!ns.timeline_label().empty()) { details = ns.timeline_label(); } else if (found_node_in_graph) { @@ -1055,7 +1058,7 @@ absl::Status MasterSession::ReffedClientGraph::CheckFetches( // Skip if already fed. if (input.second) continue; TensorId id(ParseTensorName(input.first)); - const Node* n = execution_state->get_node_by_name(string(id.first)); + const Node* n = execution_state->get_node_by_name(std::string(id.first)); if (n == nullptr) { return errors::NotFound("Feed ", input.first, ": not found"); } @@ -1069,9 +1072,9 @@ absl::Status MasterSession::ReffedClientGraph::CheckFetches( // Initialize the stack with the fetch nodes. 
std::vector stack; for (size_t i = 0; i < req.num_fetches(); ++i) { - const string& fetch = req.fetch_name(i); + const std::string& fetch = req.fetch_name(i); const TensorId id(ParseTensorName(fetch)); - const Node* n = execution_state->get_node_by_name(string(id.first)); + const Node* n = execution_state->get_node_by_name(std::string(id.first)); if (n == nullptr) { return errors::NotFound("Fetch ", fetch, ": not found"); } @@ -1120,7 +1123,7 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() { // NOTE(mrry): We must capture `worker_cache_` since `this` // could be deleted before the callback is called. WorkerCacheInterface* worker_cache = worker_cache_; - const string name = part.name; + const std::string name = part.name; WorkerInterface* w = part.worker; CHECK_NOTNULL(w); auto cb = [worker_cache, c, name, w](const absl::Status& s) { @@ -1138,10 +1141,10 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() { } namespace { -void CopyAndSortStrings(size_t size, - const std::function& input_accessor, - protobuf::RepeatedPtrField* output) { - std::vector temp; +void CopyAndSortStrings( + size_t size, const std::function& input_accessor, + protobuf::RepeatedPtrField* output) { + std::vector temp; temp.reserve(size); for (size_t i = 0; i < size; ++i) { output->Add(input_accessor(i)); @@ -1194,22 +1197,22 @@ void BuildBuildGraphOptions(const PartialRunSetupRequest& req, // TODO(cais): Add TFDBG support to partial runs. } -uint64 HashBuildGraphOptions(const BuildGraphOptions& opts) { - uint64 h = 0x2b992ddfa23249d6ull; - for (const string& name : opts.callable_options.feed()) { +uint64_t HashBuildGraphOptions(const BuildGraphOptions& opts) { + uint64_t h = 0x2b992ddfa23249d6ull; + for (const std::string& name : opts.callable_options.feed()) { h = Hash64(name.c_str(), name.size(), h); } - for (const string& name : opts.callable_options.target()) { + for (const std::string& name : opts.callable_options.target()) { h = Hash64(name.c_str(), name.size(), h); } - for (const string& name : opts.callable_options.fetch()) { + for (const std::string& name : opts.callable_options.fetch()) { h = Hash64(name.c_str(), name.size(), h); } const DebugOptions& debug_options = opts.callable_options.run_options().debug_options(); if (!debug_options.debug_tensor_watch_opts().empty()) { - const string watch_summary = + const std::string watch_summary = SummarizeDebugTensorWatches(debug_options.debug_tensor_watch_opts()); h = Hash64(watch_summary.c_str(), watch_summary.size(), h); } @@ -1217,17 +1220,17 @@ uint64 HashBuildGraphOptions(const BuildGraphOptions& opts) { return h; } -string BuildGraphOptionsString(const BuildGraphOptions& opts) { - string buf; - for (const string& name : opts.callable_options.feed()) { +std::string BuildGraphOptionsString(const BuildGraphOptions& opts) { + std::string buf; + for (const std::string& name : opts.callable_options.feed()) { absl::StrAppend(&buf, " FdE: ", name); } absl::StrAppend(&buf, "\n"); - for (const string& name : opts.callable_options.target()) { + for (const std::string& name : opts.callable_options.target()) { absl::StrAppend(&buf, " TN: ", name); } absl::StrAppend(&buf, "\n"); - for (const string& name : opts.callable_options.fetch()) { + for (const std::string& name : opts.callable_options.fetch()) { absl::StrAppend(&buf, " FeE: ", name); } if (opts.collective_graph_key != BuildGraphOptions::kNoCollectiveGraphKey) { @@ -1242,7 +1245,7 @@ MasterSession::MasterSession( std::unique_ptr>> remote_devs, std::unique_ptr worker_cache, 
std::unique_ptr device_set, - std::vector filtered_worker_list, + std::vector filtered_worker_list, StatsPublisherFactory stats_publisher_factory) : session_opts_(opt), env_(env), @@ -1301,12 +1304,12 @@ absl::Status MasterSession::Create(GraphDef&& graph_def, absl::Status MasterSession::CreateWorkerSessions( const ClusterDef& cluster_def) { - const std::vector worker_names = filtered_worker_list_; + const std::vector worker_names = filtered_worker_list_; WorkerCacheInterface* worker_cache = get_worker_cache(); struct WorkerGroup { // The worker name. (Not owned.) - const string* name; + const std::string* name; // The worker referenced by name. (Not owned.) WorkerInterface* worker = nullptr; @@ -1328,8 +1331,8 @@ absl::Status MasterSession::CreateWorkerSessions( } }); - string task_name; - string local_device_name; + std::string task_name; + std::string local_device_name; DeviceNameUtils::SplitDeviceName(devices_->client_device()->name(), &task_name, &local_device_name); const int64_t client_device_incarnation = @@ -1435,11 +1438,11 @@ absl::Status MasterSession::CreateWorkerSessions( absl::Status MasterSession::DeleteWorkerSessions() { WorkerCacheInterface* worker_cache = get_worker_cache(); - const std::vector& worker_names = filtered_worker_list_; + const std::vector& worker_names = filtered_worker_list_; struct WorkerGroup { // The worker name. (Not owned.) - const string* name; + const std::string* name; // The worker referenced by name. (Not owned.) WorkerInterface* worker = nullptr; @@ -1554,7 +1557,7 @@ absl::Status MasterSession::StartStep(const BuildGraphOptions& opts, bool is_partial, ReffedClientGraph** out_rcg, int64_t* out_count) { - const uint64 hash = HashBuildGraphOptions(opts); + const uint64_t hash = HashBuildGraphOptions(opts); { mutex_lock l(mu_); // TODO(suharshs): We cache partial run graphs and run graphs separately @@ -1599,12 +1602,12 @@ void MasterSession::ClearRunsTable(std::vector* to_unref, rcg_map->clear(); } -uint64 MasterSession::NewStepId(int64_t graph_key) { +uint64_t MasterSession::NewStepId(int64_t graph_key) { if (graph_key == BuildGraphOptions::kNoCollectiveGraphKey) { // StepId must leave the most-significant 7 bits empty for future use. return random::New64() & (((1uLL << 56) - 1) | (1uLL << 56)); } else { - uint64 step_id = env_->collective_executor_mgr->NextStepId(graph_key); + uint64_t step_id = env_->collective_executor_mgr->NextStepId(graph_key); int32_t retry_count = 0; while (static_cast(step_id) == CollectiveExecutor::kInvalidId) { absl::Notification note; @@ -1631,7 +1634,7 @@ uint64 MasterSession::NewStepId(int64_t graph_key) { absl::Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req, PartialRunSetupResponse* resp) { - std::vector inputs, outputs, targets; + std::vector inputs, outputs, targets; for (const auto& feed : req->feed()) { inputs.push_back(feed); } @@ -1642,7 +1645,7 @@ absl::Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req, targets.push_back(target); } - string handle = std::to_string(partial_run_handle_counter_.fetch_add(1)); + std::string handle = std::to_string(partial_run_handle_counter_.fetch_add(1)); ReffedClientGraph* rcg = nullptr; @@ -1706,11 +1709,11 @@ absl::Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) { // The closures popts.{new_name,get_incarnation} are called synchronously in // RegisterPartitions() below, so do not need a Ref()/Unref() pair to keep // "this" alive during the closure. 
- popts.new_name = [this](const string& prefix) { + popts.new_name = [this](const std::string& prefix) { mutex_lock l(mu_); return absl::StrCat(prefix, "_S", next_node_id_++); }; - popts.get_incarnation = [this](const string& name) -> int64 { + popts.get_incarnation = [this](const std::string& name) -> int64_t { Device* d = devices_->FindDeviceByName(name); if (d == nullptr) { return PartitionOptions::kIllegalIncarnation; @@ -1746,7 +1749,7 @@ absl::Status MasterSession::DoPartialRun(CallOptions* opts, const RunStepRequestWrapper& req, MutableRunStepResponseWrapper* resp) { auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); }); - const string& prun_handle = req.partial_run_handle(); + const std::string& prun_handle = req.partial_run_handle(); RunState* run_state = nullptr; { mutex_lock l(mu_); @@ -1802,7 +1805,7 @@ absl::Status MasterSession::DoPartialRun(CallOptions* opts, // Make sure that this is a new set of feeds that are still pending. for (size_t i = 0; i < req.num_feeds(); ++i) { - const string& feed = req.feed_name(i); + const std::string& feed = req.feed_name(i); auto it = run_state->pending_inputs.find(feed); if (it == run_state->pending_inputs.end()) { return errors::InvalidArgument( @@ -1814,7 +1817,7 @@ absl::Status MasterSession::DoPartialRun(CallOptions* opts, } // Check that this is a new set of fetches that are still pending. for (size_t i = 0; i < req.num_fetches(); ++i) { - const string& fetch = req.fetch_name(i); + const std::string& fetch = req.fetch_name(i); auto it = run_state->pending_outputs.find(fetch); if (it == run_state->pending_outputs.end()) { return errors::InvalidArgument( @@ -1879,17 +1882,17 @@ absl::Status MasterSession::CreateDebuggerState( TF_RETURN_IF_ERROR( DebuggerStateRegistry::CreateState(debug_options, debugger_state)); - std::vector input_names; + std::vector input_names; input_names.reserve(req.num_feeds()); for (size_t i = 0; i < req.num_feeds(); ++i) { input_names.push_back(req.feed_name(i)); } - std::vector output_names; + std::vector output_names; output_names.reserve(req.num_fetches()); for (size_t i = 0; i < req.num_fetches(); ++i) { output_names.push_back(req.fetch_name(i)); } - std::vector target_names; + std::vector target_names; target_names.reserve(req.num_targets()); for (size_t i = 0; i < req.num_targets(); ++i) { target_names.push_back(req.target_name(i)); @@ -1908,7 +1911,7 @@ absl::Status MasterSession::CreateDebuggerState( void MasterSession::FillPerStepState(MasterSession::ReffedClientGraph* rcg, const RunOptions& run_options, - uint64 step_id, int64_t count, + uint64_t step_id, int64_t count, PerStepState* out_pss, std::unique_ptr* out_ph) { out_pss->collect_timeline = @@ -1935,7 +1938,7 @@ void MasterSession::FillPerStepState(MasterSession::ReffedClientGraph* rcg, } absl::Status MasterSession::PostRunCleanup( - MasterSession::ReffedClientGraph* rcg, uint64 step_id, + MasterSession::ReffedClientGraph* rcg, uint64_t step_id, const RunOptions& run_options, PerStepState* pss, const std::unique_ptr& ph, const absl::Status& run_status, RunMetadata* out_run_metadata) { @@ -2004,7 +2007,7 @@ absl::Status MasterSession::DoRunWithLocalExecution( // Keeps the highest 8 bits 0x01: we reserve some bits of the // step_id for future use. 
-  uint64 step_id = NewStepId(rcg->collective_graph_key());
+  uint64_t step_id = NewStepId(rcg->collective_graph_key());
   TRACEPRINTF("stepid %llu", step_id);

   std::unique_ptr<ProfileHandler> ph;
@@ -2054,7 +2057,7 @@ absl::Status MasterSession::MakeCallable(const MakeCallableRequest& req,
     return s;
   }

-  uint64 handle;
+  uint64_t handle;
   {
     mutex_lock l(mu_);
     handle = next_callable_handle_++;
@@ -2077,7 +2080,7 @@ absl::Status MasterSession::DoRunCallable(CallOptions* opts,

   // Prepare.
   int64_t count = rcg->get_and_increment_execution_count();
-  const uint64 step_id = NewStepId(rcg->collective_graph_key());
+  const uint64_t step_id = NewStepId(rcg->collective_graph_key());
   TRACEPRINTF("stepid %llu", step_id);

   const RunOptions& run_options = rcg->callable_options().run_options();
@@ -2176,10 +2179,10 @@ void MasterSession::GarbageCollect() {
   Unref();
 }

-MasterSession::RunState::RunState(const std::vector<string>& input_names,
-                                  const std::vector<string>& output_names,
-                                  ReffedClientGraph* rcg, const uint64 step_id,
-                                  const int64_t count)
+MasterSession::RunState::RunState(const std::vector<std::string>& input_names,
+                                  const std::vector<std::string>& output_names,
+                                  ReffedClientGraph* rcg,
+                                  const uint64_t step_id, const int64_t count)
     : rcg(rcg), step_id(step_id), count(count) {
   // Initially all the feeds and fetches are pending.
   for (auto& name : input_names) {
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index f7016518bca5a9..b22953547b8f7c 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -52,7 +52,7 @@ class MasterSession : public core::RefCounted {
       std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
       std::unique_ptr<WorkerCacheInterface> worker_cache,
       std::unique_ptr<DeviceSet> device_set,
-      std::vector<string> filtered_worker_list,
+      std::vector<std::string> filtered_worker_list,
       StatsPublisherFactory stats_publisher_factory);

   // Initialize the MasterSession for "def".  Must be called before Extend(),
@@ -60,11 +60,13 @@ class MasterSession : public core::RefCounted {
   absl::Status Create(GraphDef&& def, const ClusterDef& cluster_def);

   // Returns the session handle.
-  const string& handle() const { return handle_; }
+  const std::string& handle() const { return handle_; }

   // Returns the last access time (the number of micro-seconds since
   // some fixed point in time) of this session.
-  uint64 last_access_time_usec() const { return last_access_time_usec_.load(); }
+  uint64_t last_access_time_usec() const {
+    return last_access_time_usec_.load();
+  }

   // Attempt to extend the graph according to the given "req".
   // (See master.proto for details of valid extensions.)
@@ -117,7 +119,7 @@ class MasterSession : public core::RefCounted {
   const MasterEnv* env_;

   // The opaque session handle.
-  const string handle_;
+  const std::string handle_;

   std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs_;

@@ -132,7 +134,7 @@ class MasterSession : public core::RefCounted {

   // The (partial device) names of remote worker tasks that this
   // session will contact.
-  const std::vector<string> filtered_worker_list_;
+  const std::vector<std::string> filtered_worker_list_;

   StatsPublisherFactory stats_publisher_factory_;

@@ -140,7 +142,7 @@ class MasterSession : public core::RefCounted {

   std::atomic<int64_t> partial_run_handle_counter_ = {0};

-  uint64 NewStepId(int64_t graph_key);
+  uint64_t NewStepId(int64_t graph_key);

   mutex mu_;
   std::unique_ptr<GraphExecutionState> execution_state_ TF_GUARDED_BY(mu_);
@@ -152,7 +154,7 @@ class MasterSession : public core::RefCounted {
   // before a new substitute has been created, Variables can go out of
   // scope and lose their state.
   class ReffedClientGraph;
-  typedef std::unordered_map<uint64, ReffedClientGraph*> RCGMap;
+  typedef std::unordered_map<uint64_t, ReffedClientGraph*> RCGMap;
   RCGMap run_graphs_ TF_GUARDED_BY(mu_);
   RCGMap partial_run_graphs_ TF_GUARDED_BY(mu_);
   int64_t next_callable_handle_ TF_GUARDED_BY(mu_) = 0;
@@ -172,35 +174,36 @@ class MasterSession : public core::RefCounted {
   };

   struct RunState {
-    std::unordered_map<string, bool> pending_inputs;   // true if fed
-    std::unordered_map<string, bool> pending_outputs;  // true if fetched
+    std::unordered_map<std::string, bool> pending_inputs;   // true if fed
+    std::unordered_map<std::string, bool> pending_outputs;  // true if fetched
     ReffedClientGraph* rcg = nullptr;
-    uint64 step_id;
+    uint64_t step_id;
     int64_t collective_graph_key;
     int64_t count = 0;
     PerStepState pss;
     std::unique_ptr<ProfileHandler> ph;
     bool step_started = false;

-    RunState(const std::vector<string>& input_names,
-             const std::vector<string>& output_names, ReffedClientGraph* rcg,
-             const uint64 step_id, const int64_t count);
+    RunState(const std::vector<std::string>& input_names,
+             const std::vector<std::string>& output_names,
+             ReffedClientGraph* rcg, const uint64_t step_id,
+             const int64_t count);

     bool PendingDone() const;

     ~RunState();
   };
-  std::unordered_map<string, std::unique_ptr<RunState>> partial_runs_
+  std::unordered_map<std::string, std::unique_ptr<RunState>> partial_runs_
       TF_GUARDED_BY(mu_);

   // Active RunStep calls.
   condition_variable num_running_is_zero_;
-  int32 num_running_ TF_GUARDED_BY(mu_) = 0;
+  int32_t num_running_ TF_GUARDED_BY(mu_) = 0;

   bool closed_ TF_GUARDED_BY(mu_) = false;
   bool garbage_collected_ TF_GUARDED_BY(mu_) = false;

-  std::unordered_map<uint64, int64_t> subgraph_execution_counts_
+  std::unordered_map<uint64_t, int64_t> subgraph_execution_counts_
       TF_GUARDED_BY(mu_);

   // We need to ensure that certain nodes added (e.g., send and recv
@@ -228,7 +231,7 @@ class MasterSession : public core::RefCounted {
   void ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
                       RCGMap* rcg_map) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
   void FillPerStepState(MasterSession::ReffedClientGraph* rcg,
-                        const RunOptions& run_options, uint64 step_id,
+                        const RunOptions& run_options, uint64_t step_id,
                         int64_t count, PerStepState* out_pss,
                         std::unique_ptr<ProfileHandler>* out_ph);
   absl::Status DoRunWithLocalExecution(CallOptions* opts,
@@ -240,7 +243,7 @@ class MasterSession : public core::RefCounted {
                              const RunCallableRequest& req,
                              RunCallableResponse* resp);
   absl::Status PostRunCleanup(MasterSession::ReffedClientGraph* rcg,
-                              uint64 step_id, const RunOptions& run_options,
+                              uint64_t step_id, const RunOptions& run_options,
                               PerStepState* pss,
                               const std::unique_ptr<ProfileHandler>& ph,
                               const absl::Status& run_status,
diff --git a/tensorflow/core/distributed_runtime/master_test.cc b/tensorflow/core/distributed_runtime/master_test.cc
index ed6461c63b07ac..8269f1dca201cd 100644
--- a/tensorflow/core/distributed_runtime/master_test.cc
+++ b/tensorflow/core/distributed_runtime/master_test.cc
@@ -44,7 +44,7 @@ namespace tensorflow {
 class MasterTest : public ::testing::Test {
  protected:
   MasterTest() {
-    std::vector<string> targets;
+    std::vector<std::string> targets;
     SessionOptions options;
     (*options.config.mutable_device_count())["CPU"] = 1;
     (*options.config.mutable_device_count())["GPU"] = 0;
@@ -64,7 +64,7 @@ class MasterTest : public ::testing::Test {

   // Helpers for MasterService.{CreateSession,RunStep,CloseSession}
   // rpc calls.
-  absl::Status CreateSession(const GraphDef& def, string* handle,
+  absl::Status CreateSession(const GraphDef& def, std::string* handle,
                              int64_t* initial_version) {
     ::grpc::ClientContext ctx;
     CreateSessionRequest req;
@@ -81,7 +81,7 @@ class MasterTest : public ::testing::Test {
     return s;
   }

-  absl::Status ExtendSession(const string& handle, const GraphDef& def,
+  absl::Status ExtendSession(const std::string& handle, const GraphDef& def,
                              int64_t current_version, int64_t* new_version) {
     ::grpc::ClientContext ctx;
     ExtendSessionRequest req;
@@ -98,21 +98,21 @@ class MasterTest : public ::testing::Test {
   }

   absl::Status RunStep(
-      const string& handle,
-      const std::vector<std::pair<string, const Tensor*> >& feed,
-      const std::map<string, Tensor*>& fetch) {
+      const std::string& handle,
+      const std::vector<std::pair<std::string, const Tensor*> >& feed,
+      const std::map<std::string, Tensor*>& fetch) {
     ::grpc::ClientContext ctx;
     RunStepRequest req;
     req.set_session_handle(handle);
     for (const auto& p : feed) {
-      const string& feed_name = p.first;
+      const std::string& feed_name = p.first;
       const Tensor* feed_tensor = p.second;
       auto f = req.add_feed();
       f->set_name(feed_name);
       feed_tensor->AsProtoTensorContent(f->mutable_tensor());
     }
     for (const auto& p : fetch) {
-      const string& fetch_name = p.first;
+      const std::string& fetch_name = p.first;
       req.add_fetch(fetch_name);
     }
     RunStepResponse resp;
@@ -127,7 +127,7 @@ class MasterTest : public ::testing::Test {
     return s;
   }

-  absl::Status CloseSession(const string& handle) {
+  absl::Status CloseSession(const std::string& handle) {
     ::grpc::ClientContext ctx;
     CloseSessionRequest req;
     req.set_session_handle(handle);
@@ -145,7 +145,7 @@ class MasterTest : public ::testing::Test {

 TEST_F(MasterTest, CreateClose) {
   GraphDef def;  // Empty.
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def, &handle, &initial_version));
   EXPECT_TRUE(absl::IsAborted(CloseSession("randombits")));
@@ -164,7 +164,7 @@ TEST_F(MasterTest, ListDevices) {

 TEST_F(MasterTest, Reset) {
   GraphDef def;  // Empty.
-  string s1, s2;
+  std::string s1, s2;
   int64_t initial_version1, initial_version2;
   TF_ASSERT_OK(CreateSession(def, &s1, &initial_version1));
   TF_ASSERT_OK(CreateSession(def, &s2, &initial_version2));
@@ -175,7 +175,7 @@ TEST_F(MasterTest, Reset) {

 TEST_F(MasterTest, Extend) {
   GraphDef def_0;  // Empty.
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version));

@@ -216,7 +216,7 @@ TEST_F(MasterTest, Extend) {

 TEST_F(MasterTest, ExtendUpdateStatefulFails) {
   GraphDef def_0;  // Empty.
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version));

@@ -235,7 +235,7 @@ TEST_F(MasterTest, ExtendUpdateStatefulFails) {

 TEST_F(MasterTest, ExtendTwiceFails) {
   GraphDef def_0;  // Empty.
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version));

@@ -254,7 +254,7 @@ TEST_F(MasterTest, ExtendTwiceFails) {

 TEST_F(MasterTest, ConcurrentExtendOnlyOneSucceeds) {
   GraphDef def_0;  // Empty.
- string handle; + std::string handle; int64_t initial_version; TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version)); @@ -306,7 +306,7 @@ TEST_F(MasterTest, ConcurrentExtendAndRun) { GraphDef def_0; test::graph::ToGraphDef(&graph_0, &def_0); - string handle; + std::string handle; int64_t initial_version; TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version)); @@ -388,7 +388,7 @@ TEST_F(MasterTest, EigenProblem) { GraphDef def; test::graph::ToGraphDef(&graph, &def); - string handle; + std::string handle; int64_t initial_version; TF_CHECK_OK(CreateSession(def, &handle, &initial_version)); diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc index 60a264565dbb61..7eabcadcc173bf 100644 --- a/tensorflow/core/distributed_runtime/message_wrappers.cc +++ b/tensorflow/core/distributed_runtime/message_wrappers.cc @@ -38,24 +38,24 @@ bool ParseTensorProtoToTensor(const TensorProto& tensor_proto, return false; } -const string& InMemoryRunStepRequest::session_handle() const { +const std::string& InMemoryRunStepRequest::session_handle() const { return session_handle_; } -void InMemoryRunStepRequest::set_session_handle(const string& handle) { +void InMemoryRunStepRequest::set_session_handle(const std::string& handle) { session_handle_ = handle; } -const string& InMemoryRunStepRequest::partial_run_handle() const { +const std::string& InMemoryRunStepRequest::partial_run_handle() const { return partial_run_handle_; } -void InMemoryRunStepRequest::set_partial_run_handle(const string& handle) { +void InMemoryRunStepRequest::set_partial_run_handle(const std::string& handle) { partial_run_handle_ = handle; } size_t InMemoryRunStepRequest::num_feeds() const { return feeds_.size(); } -const string& InMemoryRunStepRequest::feed_name(size_t i) const { +const std::string& InMemoryRunStepRequest::feed_name(size_t i) const { return feeds_[i].first; } @@ -71,23 +71,24 @@ absl::Status InMemoryRunStepRequest::FeedValue(size_t i, return absl::OkStatus(); } -void InMemoryRunStepRequest::add_feed(const string& name, const Tensor& value) { +void InMemoryRunStepRequest::add_feed(const std::string& name, + const Tensor& value) { feeds_.emplace_back(name, value); } size_t InMemoryRunStepRequest::num_fetches() const { return fetches_.size(); } -const string& InMemoryRunStepRequest::fetch_name(size_t i) const { +const std::string& InMemoryRunStepRequest::fetch_name(size_t i) const { return fetches_[i]; } -void InMemoryRunStepRequest::add_fetch(const string& name) { +void InMemoryRunStepRequest::add_fetch(const std::string& name) { fetches_.push_back(name); } size_t InMemoryRunStepRequest::num_targets() const { return targets_.size(); } -const string& InMemoryRunStepRequest::target_name(size_t i) const { +const std::string& InMemoryRunStepRequest::target_name(size_t i) const { return targets_[i]; } -void InMemoryRunStepRequest::add_target(const string& name) { +void InMemoryRunStepRequest::add_target(const std::string& name) { targets_.push_back(name); } @@ -108,7 +109,7 @@ void InMemoryRunStepRequest::set_store_errors_in_response_body( store_errors_in_response_body_ = store_errors; } -string InMemoryRunStepRequest::DebugString() const { +std::string InMemoryRunStepRequest::DebugString() const { return ToProto().DebugString(); } @@ -133,24 +134,25 @@ const RunStepRequest& InMemoryRunStepRequest::ToProto() const { return *proto_version_; } -const string& MutableProtoRunStepRequest::session_handle() const { +const std::string& 
MutableProtoRunStepRequest::session_handle() const { return request_.session_handle(); } -void MutableProtoRunStepRequest::set_session_handle(const string& handle) { +void MutableProtoRunStepRequest::set_session_handle(const std::string& handle) { request_.set_session_handle(handle); } -const string& MutableProtoRunStepRequest::partial_run_handle() const { +const std::string& MutableProtoRunStepRequest::partial_run_handle() const { return request_.partial_run_handle(); } -void MutableProtoRunStepRequest::set_partial_run_handle(const string& handle) { +void MutableProtoRunStepRequest::set_partial_run_handle( + const std::string& handle) { request_.set_partial_run_handle(handle); } size_t MutableProtoRunStepRequest::num_feeds() const { return request_.feed_size(); } -const string& MutableProtoRunStepRequest::feed_name(size_t i) const { +const std::string& MutableProtoRunStepRequest::feed_name(size_t i) const { return request_.feed(i).name(); } absl::Status MutableProtoRunStepRequest::FeedValue(size_t i, @@ -168,7 +170,7 @@ absl::Status MutableProtoRunStepRequest::FeedValue( return absl::OkStatus(); } -void MutableProtoRunStepRequest::add_feed(const string& name, +void MutableProtoRunStepRequest::add_feed(const std::string& name, const Tensor& value) { NamedTensorProto* feed = request_.add_feed(); feed->set_name(name); @@ -180,10 +182,10 @@ size_t MutableProtoRunStepRequest::num_fetches() const { return request_.fetch_size(); } -const string& MutableProtoRunStepRequest::fetch_name(size_t i) const { +const std::string& MutableProtoRunStepRequest::fetch_name(size_t i) const { return request_.fetch(i); } -void MutableProtoRunStepRequest::add_fetch(const string& name) { +void MutableProtoRunStepRequest::add_fetch(const std::string& name) { request_.add_fetch(name); } @@ -191,11 +193,11 @@ size_t MutableProtoRunStepRequest::num_targets() const { return request_.target_size(); } -const string& MutableProtoRunStepRequest::target_name(size_t i) const { +const std::string& MutableProtoRunStepRequest::target_name(size_t i) const { return request_.target(i); } -void MutableProtoRunStepRequest::add_target(const string& name) { +void MutableProtoRunStepRequest::add_target(const std::string& name) { request_.add_target(name); } @@ -220,7 +222,7 @@ int64_t MutableProtoRunStepRequest::request_id() const { return request_.request_id(); } -string MutableProtoRunStepRequest::DebugString() const { +std::string MutableProtoRunStepRequest::DebugString() const { return request_.DebugString(); } @@ -231,17 +233,17 @@ const RunStepRequest& MutableProtoRunStepRequest::ToProto() const { ProtoRunStepRequest::ProtoRunStepRequest(const RunStepRequest* request) : request_(request) {} -const string& ProtoRunStepRequest::session_handle() const { +const std::string& ProtoRunStepRequest::session_handle() const { return request_->session_handle(); } -const string& ProtoRunStepRequest::partial_run_handle() const { +const std::string& ProtoRunStepRequest::partial_run_handle() const { return request_->partial_run_handle(); } size_t ProtoRunStepRequest::num_feeds() const { return request_->feed_size(); } -const string& ProtoRunStepRequest::feed_name(size_t i) const { +const std::string& ProtoRunStepRequest::feed_name(size_t i) const { return request_->feed(i).name(); } @@ -264,7 +266,7 @@ size_t ProtoRunStepRequest::num_fetches() const { return request_->fetch_size(); } -const string& ProtoRunStepRequest::fetch_name(size_t i) const { +const std::string& ProtoRunStepRequest::fetch_name(size_t i) const { return request_->fetch(i); } 
@@ -272,7 +274,7 @@ size_t ProtoRunStepRequest::num_targets() const { return request_->target_size(); } -const string& ProtoRunStepRequest::target_name(size_t i) const { +const std::string& ProtoRunStepRequest::target_name(size_t i) const { return request_->target(i); } @@ -288,13 +290,13 @@ int64_t ProtoRunStepRequest::request_id() const { return request_->request_id(); } -string ProtoRunStepRequest::DebugString() const { +std::string ProtoRunStepRequest::DebugString() const { return request_->DebugString(); } const RunStepRequest& ProtoRunStepRequest::ToProto() const { return *request_; } -const string& InMemoryRunGraphRequest::session_handle() const { +const std::string& InMemoryRunGraphRequest::session_handle() const { return session_handle_; } @@ -302,7 +304,7 @@ bool InMemoryRunGraphRequest::create_worker_session_called() const { return create_worker_session_called_; } -void InMemoryRunGraphRequest::set_session_handle(const string& handle) { +void InMemoryRunGraphRequest::set_session_handle(const std::string& handle) { session_handle_ = handle; } @@ -310,11 +312,11 @@ void InMemoryRunGraphRequest::set_create_worker_session_called(bool called) { create_worker_session_called_ = called; } -const string& InMemoryRunGraphRequest::graph_handle() const { +const std::string& InMemoryRunGraphRequest::graph_handle() const { return graph_handle_; } -void InMemoryRunGraphRequest::set_graph_handle(const string& handle) { +void InMemoryRunGraphRequest::set_graph_handle(const std::string& handle) { graph_handle_ = handle; } @@ -334,7 +336,7 @@ ExecutorOpts* InMemoryRunGraphRequest::mutable_exec_opts() { size_t InMemoryRunGraphRequest::num_sends() const { return sends_.size(); } -const string& InMemoryRunGraphRequest::send_key(size_t i) const { +const std::string& InMemoryRunGraphRequest::send_key(size_t i) const { return sends_[i].first; } @@ -346,7 +348,7 @@ absl::Status InMemoryRunGraphRequest::SendValue(size_t i, absl::Status InMemoryRunGraphRequest::AddSendFromRunStepRequest( const RunStepRequestWrapper& run_step_request, size_t i, - const string& send_key) { + const std::string& send_key) { Tensor tensor; TF_RETURN_IF_ERROR(run_step_request.FeedValue(i, &tensor)); sends_.emplace_back(send_key, std::move(tensor)); @@ -355,7 +357,7 @@ absl::Status InMemoryRunGraphRequest::AddSendFromRunStepRequest( absl::Status InMemoryRunGraphRequest::AddSendFromRunCallableRequest( const RunCallableRequest& run_callable_request, size_t i, - const string& send_key) { + const std::string& send_key) { Tensor tensor; if (!ParseTensorProtoToTensor(run_callable_request.feed(i), &tensor)) { return errors::InvalidArgument("Invalid TensorProto for feed value ", i); @@ -366,11 +368,11 @@ absl::Status InMemoryRunGraphRequest::AddSendFromRunCallableRequest( size_t InMemoryRunGraphRequest::num_recvs() const { return recvs_.size(); } -const string& InMemoryRunGraphRequest::recv_key(size_t i) const { +const std::string& InMemoryRunGraphRequest::recv_key(size_t i) const { return recvs_[i]; } -void InMemoryRunGraphRequest::add_recv_key(const string& recv_key) { +void InMemoryRunGraphRequest::add_recv_key(const std::string& recv_key) { recvs_.push_back(recv_key); } @@ -430,11 +432,12 @@ const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const { return *proto_version_; } -const string& MutableProtoRunGraphRequest::session_handle() const { +const std::string& MutableProtoRunGraphRequest::session_handle() const { return request_.session_handle(); } -void MutableProtoRunGraphRequest::set_session_handle(const string& handle) { 
+void MutableProtoRunGraphRequest::set_session_handle( + const std::string& handle) { request_.set_session_handle(handle); } @@ -447,11 +450,11 @@ void MutableProtoRunGraphRequest::set_create_worker_session_called( request_.set_create_worker_session_called(called); } -const string& MutableProtoRunGraphRequest::graph_handle() const { +const std::string& MutableProtoRunGraphRequest::graph_handle() const { return request_.graph_handle(); } -void MutableProtoRunGraphRequest::set_graph_handle(const string& handle) { +void MutableProtoRunGraphRequest::set_graph_handle(const std::string& handle) { request_.set_graph_handle(handle); } @@ -475,7 +478,7 @@ size_t MutableProtoRunGraphRequest::num_sends() const { return request_.send_size(); } -const string& MutableProtoRunGraphRequest::send_key(size_t i) const { +const std::string& MutableProtoRunGraphRequest::send_key(size_t i) const { return request_.send(i).name(); } @@ -490,7 +493,7 @@ absl::Status MutableProtoRunGraphRequest::SendValue(size_t i, absl::Status MutableProtoRunGraphRequest::AddSendFromRunStepRequest( const RunStepRequestWrapper& run_step_request, size_t i, - const string& send_key) { + const std::string& send_key) { NamedTensorProto* send = request_.add_send(); send->set_name(send_key); TF_RETURN_IF_ERROR(run_step_request.FeedValue(i, send->mutable_tensor())); @@ -499,7 +502,7 @@ absl::Status MutableProtoRunGraphRequest::AddSendFromRunStepRequest( absl::Status MutableProtoRunGraphRequest::AddSendFromRunCallableRequest( const RunCallableRequest& run_callable_request, size_t i, - const string& send_key) { + const std::string& send_key) { NamedTensorProto* send = request_.add_send(); send->set_name(send_key); *send->mutable_tensor() = run_callable_request.feed(i); @@ -510,11 +513,11 @@ size_t MutableProtoRunGraphRequest::num_recvs() const { return request_.recv_key_size(); } -const string& MutableProtoRunGraphRequest::recv_key(size_t i) const { +const std::string& MutableProtoRunGraphRequest::recv_key(size_t i) const { return request_.recv_key(i); } -void MutableProtoRunGraphRequest::add_recv_key(const string& recv_key) { +void MutableProtoRunGraphRequest::add_recv_key(const std::string& recv_key) { request_.add_recv_key(recv_key); } @@ -559,7 +562,7 @@ const RunGraphRequest& MutableProtoRunGraphRequest::ToProto() const { ProtoRunGraphRequest::ProtoRunGraphRequest(const RunGraphRequest* request) : request_(request) {} -const string& ProtoRunGraphRequest::session_handle() const { +const std::string& ProtoRunGraphRequest::session_handle() const { return request_->session_handle(); } @@ -567,7 +570,7 @@ bool ProtoRunGraphRequest::create_worker_session_called() const { return request_->create_worker_session_called(); } -const string& ProtoRunGraphRequest::graph_handle() const { +const std::string& ProtoRunGraphRequest::graph_handle() const { return request_->graph_handle(); } @@ -579,7 +582,7 @@ const ExecutorOpts& ProtoRunGraphRequest::exec_opts() const { size_t ProtoRunGraphRequest::num_sends() const { return request_->send_size(); } -const string& ProtoRunGraphRequest::send_key(size_t i) const { +const std::string& ProtoRunGraphRequest::send_key(size_t i) const { return request_->send(i).name(); } @@ -596,7 +599,7 @@ size_t ProtoRunGraphRequest::num_recvs() const { return request_->recv_key_size(); } -const string& ProtoRunGraphRequest::recv_key(size_t i) const { +const std::string& ProtoRunGraphRequest::recv_key(size_t i) const { return request_->recv_key(i); } @@ -620,7 +623,7 @@ const RunGraphRequest& ProtoRunGraphRequest::ToProto() 
const { size_t InMemoryRunGraphResponse::num_recvs() const { return recvs_.size(); } -const string& InMemoryRunGraphResponse::recv_key(size_t i) const { +const std::string& InMemoryRunGraphResponse::recv_key(size_t i) const { return recvs_[i].first; } @@ -635,7 +638,8 @@ absl::Status InMemoryRunGraphResponse::RecvValue(size_t i, Tensor* out_tensor) { return absl::OkStatus(); } -void InMemoryRunGraphResponse::AddRecv(const string& key, const Tensor& value) { +void InMemoryRunGraphResponse::AddRecv(const std::string& key, + const Tensor& value) { recvs_.emplace_back(key, value); } @@ -679,7 +683,7 @@ size_t OwnedProtoRunGraphResponse::num_recvs() const { return response_.recv_size(); } -const string& OwnedProtoRunGraphResponse::recv_key(size_t i) const { +const std::string& OwnedProtoRunGraphResponse::recv_key(size_t i) const { return response_.recv(i).name(); } @@ -698,7 +702,7 @@ absl::Status OwnedProtoRunGraphResponse::RecvValue(size_t i, } } -void OwnedProtoRunGraphResponse::AddRecv(const string& key, +void OwnedProtoRunGraphResponse::AddRecv(const std::string& key, const Tensor& value) { NamedTensorProto* recv = response_.add_recv(); recv->set_name(key); @@ -752,7 +756,7 @@ size_t NonOwnedProtoRunGraphResponse::num_recvs() const { return response_->recv_size(); } -const string& NonOwnedProtoRunGraphResponse::recv_key(size_t i) const { +const std::string& NonOwnedProtoRunGraphResponse::recv_key(size_t i) const { return response_->recv(i).name(); } @@ -771,7 +775,7 @@ absl::Status NonOwnedProtoRunGraphResponse::RecvValue(size_t i, } } -void NonOwnedProtoRunGraphResponse::AddRecv(const string& key, +void NonOwnedProtoRunGraphResponse::AddRecv(const std::string& key, const Tensor& value) { NamedTensorProto* recv = response_->add_recv(); recv->set_name(key); @@ -823,7 +827,7 @@ MutableRunStepResponseWrapper::~MutableRunStepResponseWrapper() {} size_t InMemoryRunStepResponse::num_tensors() const { return tensors_.size(); } -const string& InMemoryRunStepResponse::tensor_name(size_t i) const { +const std::string& InMemoryRunStepResponse::tensor_name(size_t i) const { return tensors_[i].first; } @@ -838,7 +842,8 @@ const RunMetadata& InMemoryRunStepResponse::metadata() const { } absl::Status InMemoryRunStepResponse::AddTensorFromRunGraphResponse( - const string& name, MutableRunGraphResponseWrapper* wrapper, size_t i) { + const std::string& name, MutableRunGraphResponseWrapper* wrapper, + size_t i) { Tensor tensor; TF_RETURN_IF_ERROR(wrapper->RecvValue(i, &tensor)); tensors_.emplace_back(name, tensor); @@ -866,7 +871,7 @@ size_t OwnedProtoRunStepResponse::num_tensors() const { return response_.tensor_size(); } -const string& OwnedProtoRunStepResponse::tensor_name(size_t i) const { +const std::string& OwnedProtoRunStepResponse::tensor_name(size_t i) const { return response_.tensor(i).name(); } @@ -884,7 +889,7 @@ const RunMetadata& OwnedProtoRunStepResponse::metadata() const { } absl::Status OwnedProtoRunStepResponse::AddTensorFromRunGraphResponse( - const string& name, MutableRunGraphResponseWrapper* run_graph_response, + const std::string& name, MutableRunGraphResponseWrapper* run_graph_response, size_t i) { NamedTensorProto* response_tensor = response_.add_tensor(); response_tensor->set_name(name); @@ -919,7 +924,7 @@ size_t NonOwnedProtoRunStepResponse::num_tensors() const { return response_->tensor_size(); } -const string& NonOwnedProtoRunStepResponse::tensor_name(size_t i) const { +const std::string& NonOwnedProtoRunStepResponse::tensor_name(size_t i) const { return 
response_->tensor(i).name();
 }

@@ -937,7 +942,7 @@ const RunMetadata& NonOwnedProtoRunStepResponse::metadata() const {
 }

 absl::Status NonOwnedProtoRunStepResponse::AddTensorFromRunGraphResponse(
-    const string& name, MutableRunGraphResponseWrapper* run_graph_response,
+    const std::string& name, MutableRunGraphResponseWrapper* run_graph_response,
     size_t i) {
   NamedTensorProto* response_tensor = response_->add_tensor();
   response_tensor->set_name(name);
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index d4b07fb51ce4a3..b911d23245b4ad 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -53,15 +53,15 @@ class RunStepRequestWrapper {
   // REQUIRED: session_handle must be returned by a CreateSession call
   // to the same master service.
-  virtual const string& session_handle() const = 0;
+  virtual const std::string& session_handle() const = 0;

   // Partial run handle (optional). If specified, this will be a partial run
   // execution, run up to the specified fetches.
-  virtual const string& partial_run_handle() const = 0;
+  virtual const std::string& partial_run_handle() const = 0;

   // Tensors to be fed in the step. Each feed is a named tensor.
   virtual size_t num_feeds() const = 0;
-  virtual const string& feed_name(size_t i) const = 0;
+  virtual const std::string& feed_name(size_t i) const = 0;

   // Stores the content of the feed value at index `i` in `tensor`.
   virtual absl::Status FeedValue(size_t i, Tensor* out_tensor) const = 0;

@@ -71,12 +71,12 @@ class RunStepRequestWrapper {
   // be returned for each fetch[i] (see RunStepResponse.tensor). The
   // order of specified fetches does not change the execution order.
   virtual size_t num_fetches() const = 0;
-  virtual const string& fetch_name(size_t i) const = 0;
+  virtual const std::string& fetch_name(size_t i) const = 0;

   // Target Nodes. A list of node names. The named nodes will be run
   // to but their outputs will not be fetched.
   virtual size_t num_targets() const = 0;
-  virtual const string& target_name(size_t i) const = 0;
+  virtual const std::string& target_name(size_t i) const = 0;

   // Options for the run call.
   virtual const RunOptions& options() const = 0;
@@ -94,7 +94,7 @@ class RunStepRequestWrapper {
   virtual int64_t request_id() const = 0;

   // Returns a human-readable representation of this message for debugging.
-  virtual string DebugString() const = 0;
+  virtual std::string DebugString() const = 0;

   // Returns the wrapped data as a protocol buffer message.
   virtual const RunStepRequest& ToProto() const = 0;
@@ -105,11 +105,11 @@
 // See `RunStepRequestWrapper` above for a description of the fields.
class MutableRunStepRequestWrapper : public RunStepRequestWrapper { public: - virtual void set_session_handle(const string& handle) = 0; - virtual void set_partial_run_handle(const string& handle) = 0; - virtual void add_feed(const string& name, const Tensor& value) = 0; - virtual void add_fetch(const string& name) = 0; - virtual void add_target(const string& name) = 0; + virtual void set_session_handle(const std::string& handle) = 0; + virtual void set_partial_run_handle(const std::string& handle) = 0; + virtual void add_feed(const std::string& name, const Tensor& value) = 0; + virtual void add_fetch(const std::string& name) = 0; + virtual void add_target(const std::string& name) = 0; virtual RunOptions* mutable_options() = 0; virtual void set_store_errors_in_response_body(bool store_errors) = 0; }; @@ -119,37 +119,37 @@ class MutableRunStepRequestWrapper : public RunStepRequestWrapper { class InMemoryRunStepRequest : public MutableRunStepRequestWrapper { public: // RunStepRequestWrapper methods. - const string& session_handle() const override; - const string& partial_run_handle() const override; + const std::string& session_handle() const override; + const std::string& partial_run_handle() const override; size_t num_feeds() const override; - const string& feed_name(size_t i) const override; + const std::string& feed_name(size_t i) const override; absl::Status FeedValue(size_t i, Tensor* out_tensor) const override; absl::Status FeedValue(size_t i, TensorProto* out_tensor) const override; size_t num_fetches() const override; - const string& fetch_name(size_t i) const override; + const std::string& fetch_name(size_t i) const override; size_t num_targets() const override; - const string& target_name(size_t i) const override; + const std::string& target_name(size_t i) const override; const RunOptions& options() const override; - string DebugString() const override; + std::string DebugString() const override; const RunStepRequest& ToProto() const override; bool store_errors_in_response_body() const override; int64_t request_id() const override; // MutableRunStepRequestWrapper methods. - void set_session_handle(const string& handle) override; - void set_partial_run_handle(const string& handle) override; - void add_feed(const string& name, const Tensor& value) override; - void add_fetch(const string& name) override; - void add_target(const string& name) override; + void set_session_handle(const std::string& handle) override; + void set_partial_run_handle(const std::string& handle) override; + void add_feed(const std::string& name, const Tensor& value) override; + void add_fetch(const std::string& name) override; + void add_target(const std::string& name) override; RunOptions* mutable_options() override; void set_store_errors_in_response_body(bool store_errors) override; private: - string session_handle_; - string partial_run_handle_; - absl::InlinedVector, 4UL> feeds_; - absl::InlinedVector fetches_; - absl::InlinedVector targets_; + std::string session_handle_; + std::string partial_run_handle_; + absl::InlinedVector, 4UL> feeds_; + absl::InlinedVector fetches_; + absl::InlinedVector targets_; RunOptions options_; bool store_errors_in_response_body_ = false; @@ -170,28 +170,28 @@ class InMemoryRunStepRequest : public MutableRunStepRequestWrapper { class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper { public: // RunStepRequestWrapper methods. 
- const string& session_handle() const override; - const string& partial_run_handle() const override; + const std::string& session_handle() const override; + const std::string& partial_run_handle() const override; size_t num_feeds() const override; - const string& feed_name(size_t i) const override; + const std::string& feed_name(size_t i) const override; absl::Status FeedValue(size_t i, Tensor* out_tensor) const override; absl::Status FeedValue(size_t i, TensorProto* out_tensor) const override; size_t num_fetches() const override; - const string& fetch_name(size_t i) const override; + const std::string& fetch_name(size_t i) const override; size_t num_targets() const override; - const string& target_name(size_t i) const override; + const std::string& target_name(size_t i) const override; const RunOptions& options() const override; - string DebugString() const override; + std::string DebugString() const override; const RunStepRequest& ToProto() const override; bool store_errors_in_response_body() const override; int64_t request_id() const override; // MutableRunStepRequestWrapper methods. - void set_session_handle(const string& handle) override; - void set_partial_run_handle(const string& handle) override; - void add_feed(const string& name, const Tensor& value) override; - void add_fetch(const string& name) override; - void add_target(const string& name) override; + void set_session_handle(const std::string& handle) override; + void set_partial_run_handle(const std::string& handle) override; + void add_feed(const std::string& name, const Tensor& value) override; + void add_fetch(const std::string& name) override; + void add_target(const std::string& name) override; RunOptions* mutable_options() override; void set_store_errors_in_response_body(bool store_errors) override; @@ -211,18 +211,18 @@ class ProtoRunStepRequest : public RunStepRequestWrapper { ProtoRunStepRequest(const RunStepRequest* request); // RunStepRequestWrapper methods. - const string& session_handle() const override; - const string& partial_run_handle() const override; + const std::string& session_handle() const override; + const std::string& partial_run_handle() const override; size_t num_feeds() const override; - const string& feed_name(size_t i) const override; + const std::string& feed_name(size_t i) const override; absl::Status FeedValue(size_t i, Tensor* out_tensor) const override; absl::Status FeedValue(size_t i, TensorProto* out_tensor) const override; size_t num_fetches() const override; - const string& fetch_name(size_t i) const override; + const std::string& fetch_name(size_t i) const override; size_t num_targets() const override; - const string& target_name(size_t i) const override; + const std::string& target_name(size_t i) const override; const RunOptions& options() const override; - string DebugString() const override; + std::string DebugString() const override; const RunStepRequest& ToProto() const override; bool store_errors_in_response_body() const override; int64_t request_id() const override; @@ -254,14 +254,14 @@ class RunGraphRequestWrapper { // The session handle used to register the graph. If empty, a single global // namespace is used. - virtual const string& session_handle() const = 0; + virtual const std::string& session_handle() const = 0; // Set to true if `CreateWorkerSession` was called for `session_handle`. virtual bool create_worker_session_called() const = 0; // REQUIRED: graph_handle must be returned by a RegisterGraph call // to the same WorkerService. 
- virtual const string& graph_handle() const = 0; + virtual const std::string& graph_handle() const = 0; // A unique ID to distinguish different runs of the same graph. // @@ -276,12 +276,12 @@ class RunGraphRequestWrapper { // Sends the tensors in "send" into the graph before the run. virtual size_t num_sends() const = 0; - virtual const string& send_key(size_t i) const = 0; + virtual const std::string& send_key(size_t i) const = 0; virtual absl::Status SendValue(size_t i, Tensor* out_tensor) const = 0; // Fetches the keys into `RunGraphResponse.recv` after the run. virtual size_t num_recvs() const = 0; - virtual const string& recv_key(size_t i) const = 0; + virtual const std::string& recv_key(size_t i) const = 0; // True if the RunGraphRequest is a partial run request. virtual bool is_partial() const = 0; @@ -307,9 +307,9 @@ class RunGraphRequestWrapper { // See `RunGraphRequestWrapper` above for a description of the fields. class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper { public: - virtual void set_session_handle(const string& handle) = 0; + virtual void set_session_handle(const std::string& handle) = 0; virtual void set_create_worker_session_called(bool called) = 0; - virtual void set_graph_handle(const string& handle) = 0; + virtual void set_graph_handle(const std::string& handle) = 0; virtual void set_step_id(int64_t step_id) = 0; virtual ExecutorOpts* mutable_exec_opts() = 0; @@ -317,12 +317,12 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper { // request with the given `send_key`. virtual absl::Status AddSendFromRunStepRequest( const RunStepRequestWrapper& run_step_request, size_t i, - const string& send_key) = 0; + const std::string& send_key) = 0; virtual absl::Status AddSendFromRunCallableRequest( const RunCallableRequest& run_callable_request, size_t i, - const string& send_key) = 0; + const std::string& send_key) = 0; - virtual void add_recv_key(const string& recv_key) = 0; + virtual void add_recv_key(const std::string& recv_key) = 0; virtual void set_is_partial(bool is_partial) = 0; virtual void set_is_last_partial_run(bool is_last_partial_run) = 0; virtual void set_store_errors_in_response_body(bool store_errors) = 0; @@ -332,16 +332,16 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper { class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper { public: // RunGraphRequestWrapper methods. - const string& session_handle() const override; - const string& graph_handle() const override; + const std::string& session_handle() const override; + const std::string& graph_handle() const override; bool create_worker_session_called() const override; int64_t step_id() const override; const ExecutorOpts& exec_opts() const override; size_t num_sends() const override; - const string& send_key(size_t i) const override; + const std::string& send_key(size_t i) const override; absl::Status SendValue(size_t i, Tensor* out_tensor) const override; size_t num_recvs() const override; - const string& recv_key(size_t i) const override; + const std::string& recv_key(size_t i) const override; bool is_partial() const override; bool is_last_partial_run() const override; const RunGraphRequest& ToProto() const override; @@ -349,31 +349,31 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper { int64_t request_id() const override; // MutableRunGraphRequestWrapper methods. 
- void set_session_handle(const string& handle) override; + void set_session_handle(const std::string& handle) override; void set_create_worker_session_called(bool called) override; - void set_graph_handle(const string& handle) override; + void set_graph_handle(const std::string& handle) override; void set_step_id(int64_t step_id) override; ExecutorOpts* mutable_exec_opts() override; absl::Status AddSendFromRunStepRequest( const RunStepRequestWrapper& run_step_request, size_t i, - const string& send_key) override; + const std::string& send_key) override; absl::Status AddSendFromRunCallableRequest( const RunCallableRequest& run_callable_request, size_t i, - const string& send_key) override; - void add_recv_key(const string& recv_key) override; + const std::string& send_key) override; + void add_recv_key(const std::string& recv_key) override; void set_is_partial(bool is_partial) override; void set_is_last_partial_run(bool is_last_partial_run) override; void set_store_errors_in_response_body(bool store_errors) override; void set_request_id(int64_t request_id) override; private: - string session_handle_; + std::string session_handle_; bool create_worker_session_called_ = false; - string graph_handle_; + std::string graph_handle_; int64_t step_id_; ExecutorOpts exec_opts_; - absl::InlinedVector, 4UL> sends_; - absl::InlinedVector recvs_; + absl::InlinedVector, 4UL> sends_; + absl::InlinedVector recvs_; bool is_partial_ = false; bool is_last_partial_run_ = false; bool store_errors_in_response_body_ = false; @@ -392,16 +392,16 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper { class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper { public: // RunGraphRequestWrapper methods. - const string& session_handle() const override; + const std::string& session_handle() const override; bool create_worker_session_called() const override; - const string& graph_handle() const override; + const std::string& graph_handle() const override; int64_t step_id() const override; const ExecutorOpts& exec_opts() const override; size_t num_sends() const override; - const string& send_key(size_t i) const override; + const std::string& send_key(size_t i) const override; absl::Status SendValue(size_t i, Tensor* out_tensor) const override; size_t num_recvs() const override; - const string& recv_key(size_t i) const override; + const std::string& recv_key(size_t i) const override; bool is_partial() const override; bool is_last_partial_run() const override; bool store_errors_in_response_body() const override; @@ -409,18 +409,18 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper { const RunGraphRequest& ToProto() const override; // MutableRunGraphRequestWrapper methods. 
- void set_session_handle(const string& handle) override; + void set_session_handle(const std::string& handle) override; void set_create_worker_session_called(bool called) override; - void set_graph_handle(const string& handle) override; + void set_graph_handle(const std::string& handle) override; void set_step_id(int64_t step_id) override; ExecutorOpts* mutable_exec_opts() override; absl::Status AddSendFromRunStepRequest( const RunStepRequestWrapper& run_step_request, size_t i, - const string& send_key) override; + const std::string& send_key) override; absl::Status AddSendFromRunCallableRequest( const RunCallableRequest& run_callable_request, size_t i, - const string& send_key) override; - void add_recv_key(const string& recv_key) override; + const std::string& send_key) override; + void add_recv_key(const std::string& recv_key) override; void set_is_partial(bool is_partial) override; void set_is_last_partial_run(bool is_last_partial_run) override; void set_store_errors_in_response_body(bool store_errors) override; @@ -435,16 +435,16 @@ class ProtoRunGraphRequest : public RunGraphRequestWrapper { ProtoRunGraphRequest(const RunGraphRequest* request); // RunGraphRequestWrapper methods. - const string& session_handle() const override; + const std::string& session_handle() const override; bool create_worker_session_called() const override; - const string& graph_handle() const override; + const std::string& graph_handle() const override; int64_t step_id() const override; const ExecutorOpts& exec_opts() const override; size_t num_sends() const override; - const string& send_key(size_t i) const override; + const std::string& send_key(size_t i) const override; absl::Status SendValue(size_t i, Tensor* out_tensor) const override; size_t num_recvs() const override; - const string& recv_key(size_t i) const override; + const std::string& recv_key(size_t i) const override; bool is_partial() const override; bool is_last_partial_run() const override; bool store_errors_in_response_body() const override; @@ -480,12 +480,12 @@ class MutableRunGraphResponseWrapper { // A list of tensors corresponding to those requested by // `RunGraphRequest.recv_key`. virtual size_t num_recvs() const = 0; - virtual const string& recv_key(size_t i) const = 0; + virtual const std::string& recv_key(size_t i) const = 0; // NOTE: The following methods may perform a destructive read, for // efficiency. virtual absl::Status RecvValue(size_t i, TensorProto* out_tensor) = 0; virtual absl::Status RecvValue(size_t i, Tensor* out_tensor) = 0; - virtual void AddRecv(const string& key, const Tensor& value) = 0; + virtual void AddRecv(const std::string& key, const Tensor& value) = 0; // Submessages that store performance statistics about the subgraph // execution, if necessary. @@ -520,10 +520,10 @@ class InMemoryRunGraphResponse : public MutableRunGraphResponseWrapper { public: // MutableRunGraphResponseWrapper methods. 
size_t num_recvs() const override; - const string& recv_key(size_t i) const override; + const std::string& recv_key(size_t i) const override; absl::Status RecvValue(size_t i, TensorProto* out_tensor) override; absl::Status RecvValue(size_t i, Tensor* out_tensor) override; - void AddRecv(const string& key, const Tensor& value) override; + void AddRecv(const std::string& key, const Tensor& value) override; StepStats* mutable_step_stats() override; CostGraphDef* mutable_cost_graph() override; size_t num_partition_graphs() const override; @@ -539,7 +539,7 @@ class InMemoryRunGraphResponse : public MutableRunGraphResponseWrapper { RunGraphResponse* get_proto() override; private: - absl::InlinedVector, 4UL> recvs_; + absl::InlinedVector, 4UL> recvs_; StepStats step_stats_; CostGraphDef cost_graph_; std::vector partition_graphs_; @@ -553,10 +553,10 @@ class OwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper { public: // MutableRunGraphResponseWrapper methods. size_t num_recvs() const override; - const string& recv_key(size_t i) const override; + const std::string& recv_key(size_t i) const override; absl::Status RecvValue(size_t i, TensorProto* out_tensor) override; absl::Status RecvValue(size_t i, Tensor* out_tensor) override; - void AddRecv(const string& key, const Tensor& value) override; + void AddRecv(const std::string& key, const Tensor& value) override; StepStats* mutable_step_stats() override; CostGraphDef* mutable_cost_graph() override; size_t num_partition_graphs() const override; @@ -580,10 +580,10 @@ class NonOwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper { // MutableRunGraphResponseWrapper methods. size_t num_recvs() const override; - const string& recv_key(size_t i) const override; + const std::string& recv_key(size_t i) const override; absl::Status RecvValue(size_t i, TensorProto* out_tensor) override; absl::Status RecvValue(size_t i, Tensor* out_tensor) override; - void AddRecv(const string& key, const Tensor& value) override; + void AddRecv(const std::string& key, const Tensor& value) override; StepStats* mutable_step_stats() override; CostGraphDef* mutable_cost_graph() override; size_t num_partition_graphs() const override; @@ -628,14 +628,14 @@ class MutableRunStepResponseWrapper { // NOTE: The order of the returned tensors may or may not match // the fetch order specified in RunStepRequest. virtual size_t num_tensors() const = 0; - virtual const string& tensor_name(size_t i) const = 0; + virtual const std::string& tensor_name(size_t i) const = 0; virtual absl::Status TensorValue(size_t i, Tensor* out_tensor) const = 0; // Stores the i^{th} recv value in `run_graph_response` in this // response with the given `name`. virtual absl::Status AddTensorFromRunGraphResponse( - const string& name, MutableRunGraphResponseWrapper* run_graph_response, - size_t i) = 0; + const std::string& name, + MutableRunGraphResponseWrapper* run_graph_response, size_t i) = 0; // Returned metadata if requested in the options. virtual const RunMetadata& metadata() const = 0; @@ -666,11 +666,11 @@ class InMemoryRunStepResponse : public MutableRunStepResponseWrapper { public: // MutableRunStepResponseWrapper methods. 
size_t num_tensors() const override; - const string& tensor_name(size_t i) const override; + const std::string& tensor_name(size_t i) const override; absl::Status TensorValue(size_t i, Tensor* out_tensor) const override; absl::Status AddTensorFromRunGraphResponse( - const string& name, MutableRunGraphResponseWrapper* run_graph_response, - size_t i) override; + const std::string& name, + MutableRunGraphResponseWrapper* run_graph_response, size_t i) override; const RunMetadata& metadata() const override; RunMetadata* mutable_metadata() override; absl::Status status() const override; @@ -683,7 +683,7 @@ class InMemoryRunStepResponse : public MutableRunStepResponseWrapper { RunStepResponse* get_proto() override; private: - absl::InlinedVector<std::pair<string, Tensor>, 4UL> tensors_; + absl::InlinedVector<std::pair<std::string, Tensor>, 4UL> tensors_; RunMetadata metadata_; // Store the code and message separately so that they can be updated // independently by setters. @@ -695,11 +695,11 @@ class OwnedProtoRunStepResponse : public MutableRunStepResponseWrapper { public: // MutableRunStepResponseWrapper methods. size_t num_tensors() const override; - const string& tensor_name(size_t i) const override; + const std::string& tensor_name(size_t i) const override; absl::Status TensorValue(size_t i, Tensor* out_tensor) const override; absl::Status AddTensorFromRunGraphResponse( - const string& name, MutableRunGraphResponseWrapper* run_graph_response, - size_t i) override; + const std::string& name, + MutableRunGraphResponseWrapper* run_graph_response, size_t i) override; const RunMetadata& metadata() const override; RunMetadata* mutable_metadata() override; absl::Status status() const override; @@ -720,11 +720,11 @@ class NonOwnedProtoRunStepResponse : public MutableRunStepResponseWrapper { // MutableRunStepResponseWrapper methods.
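// For context: the in-memory wrapper classes above all share one storage
// pattern, an absl::InlinedVector of (key, Tensor) pairs with index-based
// accessors. A minimal standalone sketch of that pattern follows; the class
// name and the placeholder Tensor type are hypothetical stand-ins, not the
// actual message_wrappers implementation.
#include <cstddef>
#include <string>
#include <utility>
#include "absl/container/inlined_vector.h"

struct Tensor {};  // Placeholder for tensorflow::Tensor in this sketch.

class InMemoryRecvs {
 public:
  size_t num_recvs() const { return recvs_.size(); }
  const std::string& recv_key(size_t i) const { return recvs_[i].first; }
  const Tensor& recv_value(size_t i) const { return recvs_[i].second; }
  void AddRecv(const std::string& key, const Tensor& value) {
    recvs_.emplace_back(key, value);
  }

 private:
  // Inline capacity of 4 keeps the common small case off the heap, matching
  // the "4UL" used in the members above.
  absl::InlinedVector<std::pair<std::string, Tensor>, 4> recvs_;
};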
size_t num_tensors() const override; - const string& tensor_name(size_t i) const override; + const std::string& tensor_name(size_t i) const override; absl::Status TensorValue(size_t i, Tensor* out_tensor) const override; absl::Status AddTensorFromRunGraphResponse( - const string& name, MutableRunGraphResponseWrapper* run_graph_response, - size_t i) override; + const std::string& name, + MutableRunGraphResponseWrapper* run_graph_response, size_t i) override; const RunMetadata& metadata() const override; RunMetadata* mutable_metadata() override; absl::Status status() const override; diff --git a/tensorflow/core/distributed_runtime/message_wrappers_test.cc b/tensorflow/core/distributed_runtime/message_wrappers_test.cc index f64d476d6dde3e..9f0af827eee591 100644 --- a/tensorflow/core/distributed_runtime/message_wrappers_test.cc +++ b/tensorflow/core/distributed_runtime/message_wrappers_test.cc @@ -27,13 +27,13 @@ namespace { Tensor TensorA() { Tensor a_tensor(DT_INT32, TensorShape({2, 2})); - test::FillValues(&a_tensor, {3, 2, -1, 0}); + test::FillValues(&a_tensor, {3, 2, -1, 0}); return a_tensor; } Tensor TensorB() { Tensor b_tensor(DT_INT32, TensorShape({1, 2})); - test::FillValues(&b_tensor, {1, 2}); + test::FillValues(&b_tensor, {1, 2}); return b_tensor; } @@ -57,9 +57,9 @@ void CheckRunStepRequest(const RunStepRequestWrapper& request) { EXPECT_EQ("feed_b:0", request.feed_name(1)); Tensor val; TF_EXPECT_OK(request.FeedValue(0, &val)); - test::ExpectTensorEqual(TensorA(), val); + test::ExpectTensorEqual(TensorA(), val); TF_EXPECT_OK(request.FeedValue(1, &val)); - test::ExpectTensorEqual(TensorB(), val); + test::ExpectTensorEqual(TensorB(), val); EXPECT_EQ(2, request.num_fetches()); EXPECT_EQ("fetch_x:0", request.fetch_name(0)); @@ -92,9 +92,9 @@ void CheckRunGraphRequest(const RunGraphRequestWrapper& request) { EXPECT_EQ(2, request.num_sends()); Tensor val; TF_EXPECT_OK(request.SendValue(0, &val)); - test::ExpectTensorEqual(TensorA(), val); + test::ExpectTensorEqual(TensorA(), val); TF_EXPECT_OK(request.SendValue(1, &val)); - test::ExpectTensorEqual(TensorB(), val); + test::ExpectTensorEqual(TensorB(), val); EXPECT_TRUE(request.is_partial()); EXPECT_FALSE(request.is_last_partial_run()); } @@ -117,9 +117,9 @@ void CheckRunGraphResponse(MutableRunGraphResponseWrapper* response) { EXPECT_EQ("recv_3", response->recv_key(1)); Tensor val; TF_EXPECT_OK(response->RecvValue(0, &val)); - test::ExpectTensorEqual(TensorA(), val); + test::ExpectTensorEqual(TensorA(), val); TF_EXPECT_OK(response->RecvValue(1, &val)); - test::ExpectTensorEqual(TensorB(), val); + test::ExpectTensorEqual(TensorB(), val); ASSERT_EQ(1, response->mutable_step_stats()->dev_stats_size()); EXPECT_EQ("/cpu:0", response->mutable_step_stats()->dev_stats(0).device()); ASSERT_EQ(1, response->mutable_cost_graph()->node_size()); @@ -152,9 +152,9 @@ void CheckRunStepResponse(const MutableRunStepResponseWrapper& response) { EXPECT_EQ("fetch_y:0", response.tensor_name(1)); Tensor val; TF_EXPECT_OK(response.TensorValue(0, &val)); - test::ExpectTensorEqual(TensorA(), val); + test::ExpectTensorEqual(TensorA(), val); TF_EXPECT_OK(response.TensorValue(1, &val)); - test::ExpectTensorEqual(TensorB(), val); + test::ExpectTensorEqual(TensorB(), val); ASSERT_EQ(1, response.metadata().step_stats().dev_stats_size()); EXPECT_EQ("/cpu:0", response.metadata().step_stats().dev_stats(0).device()); ASSERT_EQ(1, response.metadata().partition_graphs_size()); diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.cc 
b/tensorflow/core/distributed_runtime/recent_request_ids.cc index f75390b26bd338..f98da9aa19629e 100644 --- a/tensorflow/core/distributed_runtime/recent_request_ids.cc +++ b/tensorflow/core/distributed_runtime/recent_request_ids.cc @@ -61,7 +61,7 @@ bool RecentRequestIds::Insert(int64_t request_id) { } absl::Status RecentRequestIds::TrackUnique(int64_t request_id, - const string& method_name, + const std::string& method_name, const protobuf::Message& request) { if (Insert(request_id)) { return absl::OkStatus(); diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.h b/tensorflow/core/distributed_runtime/recent_request_ids.h index 2eb35ac7266c6c..0299d3d9289118 100644 --- a/tensorflow/core/distributed_runtime/recent_request_ids.h +++ b/tensorflow/core/distributed_runtime/recent_request_ids.h @@ -60,11 +60,11 @@ class RecentRequestIds { // num_tracked_request_ids insertions. For backwards compatibility, this // always returns OK for request_id 0. The method_name and the request's // ShortDebugString are added to returned errors. - absl::Status TrackUnique(int64_t request_id, const string& method_name, + absl::Status TrackUnique(int64_t request_id, const std::string& method_name, const protobuf::Message& request); // Overloaded version of the above function for wrapped protos. template - absl::Status TrackUnique(int64_t request_id, const string& method_name, + absl::Status TrackUnique(int64_t request_id, const std::string& method_name, const RequestWrapper* wrapper); private: @@ -88,7 +88,7 @@ class RecentRequestIds { template absl::Status RecentRequestIds::TrackUnique(int64_t request_id, - const string& method_name, + const std::string& method_name, const RequestWrapper* wrapper) { if (Insert(request_id)) { return absl::OkStatus(); diff --git a/tensorflow/core/distributed_runtime/remote_device.cc b/tensorflow/core/distributed_runtime/remote_device.cc index ad8ac2080ab833..5bcf27d54abd1c 100644 --- a/tensorflow/core/distributed_runtime/remote_device.cc +++ b/tensorflow/core/distributed_runtime/remote_device.cc @@ -53,7 +53,7 @@ class RemoteDevice : public Device { bool IsRemoteCallAllowed() const override { return true; } private: - const string local_dev_name_; + const std::string local_dev_name_; RemoteDevice(const RemoteDevice&) = delete; void operator=(const RemoteDevice&) = delete; @@ -78,7 +78,8 @@ void AsRemoteDevices( } void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache, - const string& worker_name, NewRemoteDevicesDone done) { + const std::string& worker_name, + NewRemoteDevicesDone done) { WorkerInterface* wi = worker_cache->GetOrCreateWorker(worker_name); if (wi == nullptr) { std::vector empty; diff --git a/tensorflow/core/distributed_runtime/remote_device.h b/tensorflow/core/distributed_runtime/remote_device.h index 591531f94d567f..806123ed71b205 100644 --- a/tensorflow/core/distributed_runtime/remote_device.h +++ b/tensorflow/core/distributed_runtime/remote_device.h @@ -62,7 +62,8 @@ void AsRemoteDevices( typedef std::function*)> NewRemoteDevicesDone; void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache, - const string& worker_name, NewRemoteDevicesDone done); + const std::string& worker_name, + NewRemoteDevicesDone done); // Create Remote Device based on the given attributes. 
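// For context on the RecentRequestIds::TrackUnique signature changes above:
// the class rejects request ids seen recently so retried RPCs are not run
// twice. A rough, single-threaded sketch of that idea (hypothetical names,
// not the actual RecentRequestIds implementation; locking omitted, and
// num_tracked is assumed positive):
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>
#include "absl/status/status.h"
#include "absl/strings/str_cat.h"

class RecentIdTracker {
 public:
  explicit RecentIdTracker(int num_tracked) : ring_(num_tracked, 0) {}

  absl::Status TrackUnique(int64_t request_id, const std::string& method) {
    if (request_id == 0) return absl::OkStatus();  // 0 means "unset".
    if (!seen_.insert(request_id).second) {
      return absl::AlreadyExistsError(
          absl::StrCat("Duplicate request id for ", method));
    }
    // Evict the oldest tracked id so memory stays bounded.
    seen_.erase(ring_[next_]);
    ring_[next_] = request_id;
    next_ = (next_ + 1) % ring_.size();
    return absl::OkStatus();
  }

 private:
  std::vector<int64_t> ring_;
  std::unordered_set<int64_t> seen_;
  size_t next_ = 0;
};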
std::unique_ptr NewRemoteDevice(Env* env, diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index 946b245a3e8fce..154eb09ee9d5ff 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -127,7 +127,7 @@ class GrpcEagerClientThread : public core::RefCounted { class GrpcEagerClient : public EagerClient { public: GrpcEagerClient(const tensorflow::SharedGrpcChannelPtr& channel, - GrpcEagerClientThread* thread, const string& target) + GrpcEagerClientThread* thread, const std::string& target) : stub_(channel), thread_(thread), target_(target) { // Hold a reference to make sure the corresponding EagerClientThread // outlives the client. @@ -266,13 +266,13 @@ class GrpcEagerClient : public EagerClient { private: ::grpc::GenericStub stub_; const GrpcEagerClientThread* thread_; - const string target_; + const std::string target_; ::grpc::CompletionQueue* cq_; mutable mutex mu_; - std::unordered_map> + std::unordered_map> enqueue_dispatchers_ TF_GUARDED_BY(mu_); StatusCallback callback_wrapper(StatusCallback done) { @@ -313,7 +313,7 @@ class GrpcEagerClientCache : public EagerClientCache { ~GrpcEagerClientCache() override { threads_.clear(); } - absl::Status GetClient(const string& target, + absl::Status GetClient(const std::string& target, core::RefCountPtr* client) override { mutex_lock l(clients_mu_); auto it = clients_.find(target); @@ -342,7 +342,7 @@ class GrpcEagerClientCache : public EagerClientCache { TF_GUARDED_BY(assignment_mu_); size_t next_round_robin_assignment_ TF_GUARDED_BY(assignment_mu_); - size_t AssignClientToThread(const string& target) { + size_t AssignClientToThread(const std::string& target) { // Round-robin target assignment, but keeps the same target on the same // polling thread always, as this is important for gRPC performance mutex_lock lock(assignment_mu_); @@ -358,7 +358,7 @@ class GrpcEagerClientCache : public EagerClientCache { std::shared_ptr cache_; mutable mutex clients_mu_; - std::unordered_map> clients_ + std::unordered_map> clients_ TF_GUARDED_BY(clients_mu_); std::vector> threads_; }; diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc index 2d64e07794d41a..3a11ef95274fbc 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc @@ -48,7 +48,7 @@ TEST(GrpcEagerClientCache, TestGetClientThreadSafety) { for (int i = 0; i < num_calls; i++) { Env::Default()->SchedClosure([&client_cache, i, &counter]() { - string target = absl::StrCat("/job:worker/replica:0/task:", i); + std::string target = absl::StrCat("/job:worker/replica:0/task:", i); core::RefCountPtr eager_client; absl::Status s = client_cache->GetClient(target, &eager_client); // With 6 tasks added to the job, querying client for 0--5 should be OK, diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc index b9bea2ea437a7a..33d567c56a0a63 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc @@ -45,7 +45,7 @@ GrpcEagerServiceImpl::GrpcEagerServiceImpl( } absl::Status 
GrpcEagerServiceImpl::CreateMasterContext( - const tensorflow::uint64 context_id, EagerContext* context) { + const uint64_t context_id, EagerContext* context) { return local_impl_.CreateMasterContext(context_id, context); } diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h index 083ad55b3f4841..62ee6e9f13a9f0 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h @@ -50,8 +50,7 @@ class GrpcEagerServiceImpl : public tsl::AsyncServiceInterface { virtual ~GrpcEagerServiceImpl() {} // Create a master context in eager service. - absl::Status CreateMasterContext(tensorflow::uint64 context_id, - EagerContext* context); + absl::Status CreateMasterContext(uint64_t context_id, EagerContext* context); void HandleRPCsLoop() override; void Shutdown() override; @@ -136,7 +135,7 @@ class GrpcEagerServiceImpl : public tsl::AsyncServiceInterface { // streaming connection. absl::Status status = local_impl_.Enqueue( /*call_opts=*/nullptr, &call->request(), call->mutable_response(), - reinterpret_cast(static_cast(call))); + reinterpret_cast(static_cast(call))); if (status.ok()) { VLOG(1) << "local_impl_.Enqueue completed successfully"; diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc index 1af67bdb51b3ca..b80045c28f08cf 100644 --- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc +++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc @@ -30,7 +30,7 @@ RpcCollectiveExecutorMgr::RpcCollectiveExecutorMgr( std::unique_ptr dev_resolver, std::unique_ptr param_resolver, std::unique_ptr nccl_communicator, - WorkerCacheInterface* worker_cache, const string& task_name) + WorkerCacheInterface* worker_cache, const std::string& task_name) : CollectiveExecutorMgr(config, dev_mgr, std::move(dev_resolver), std::move(param_resolver), std::move(nccl_communicator)), @@ -172,7 +172,8 @@ void RpcCollectiveExecutorMgr::RetireStepId(int64_t graph_key, std::unique_ptr CreateProdRpcCollectiveExecutorMgr( const ConfigProto& config, const DeviceMgr* device_mgr, std::unique_ptr nccl_communicator, - WorkerCacheInterface* worker_cache, const string& default_worker_name) { + WorkerCacheInterface* worker_cache, + const std::string& default_worker_name) { auto dev_resolver = std::make_unique(device_mgr); auto param_resolver = std::make_unique( config, device_mgr, dev_resolver.get(), nccl_communicator.get(), diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h index 6836204cc1a289..aadbaf33796437 100644 --- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h +++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h @@ -39,7 +39,7 @@ class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr { std::unique_ptr dev_resolver, std::unique_ptr param_resolver, std::unique_ptr nccl_communicator, - WorkerCacheInterface* worker_cache, const string& task_name); + WorkerCacheInterface* worker_cache, const std::string& task_name); virtual ~RpcCollectiveExecutorMgr(); @@ -60,8 +60,8 @@ class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr { virtual CollectiveExecutor* Create(int64_t step_id) override; WorkerCacheInterface* const worker_cache_; // Not owned. 
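// For context on the reinterpret_cast in the Enqueue call above: the call
// object's address is reused as a numeric stream id, so the cast only needs
// a fixed-width integer wide enough to hold a pointer. A minimal sketch of
// that pattern with standard types (hypothetical helper, not the TF code):
#include <cstdint>

uint64_t StreamIdFor(const void* call) {
  // uintptr_t is guaranteed to hold a pointer value; uint64_t matches it on
  // the 64-bit platforms this code targets.
  return static_cast<uint64_t>(reinterpret_cast<std::uintptr_t>(call));
}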
- const string task_name_; - string group_leader_; + const std::string task_name_; + std::string group_leader_; friend class RpcCollectiveExecutorMgrTest; private: @@ -88,7 +88,7 @@ class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr { std::unique_ptr CreateProdRpcCollectiveExecutorMgr( const ConfigProto& config, const DeviceMgr* device_mgr, std::unique_ptr nccl_communicator, - WorkerCacheInterface* worker_cache, const string& default_worker_name); + WorkerCacheInterface* worker_cache, const std::string& default_worker_name); } // namespace tensorflow #endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COLLECTIVE_EXECUTOR_MGR_H_ diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc index f830fd96110456..55eebf621e5882 100644 --- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc @@ -39,7 +39,7 @@ namespace tensorflow { class RpcCollectiveExecutorMgrTest : public ::testing::Test { protected: RpcCollectiveExecutorMgrTest() { - string task_name = "/job:localhost/replica:0/task:0"; + std::string task_name = "/job:localhost/replica:0/task:0"; SessionOptions options; options.config.mutable_experimental()->set_collective_group_leader( task_name); diff --git a/tensorflow/core/distributed_runtime/rpcbench_test.cc b/tensorflow/core/distributed_runtime/rpcbench_test.cc index 666800294ac003..70816cc8a7b556 100644 --- a/tensorflow/core/distributed_runtime/rpcbench_test.cc +++ b/tensorflow/core/distributed_runtime/rpcbench_test.cc @@ -42,7 +42,7 @@ static const int kWorkers = 60; static thread::ThreadPool* worker_threads; void MakeGRPCCluster(const SessionOptions& options, int n, - std::vector* workers, + std::vector* workers, std::vector* devices) { CHECK_GE(n, 1); @@ -100,7 +100,7 @@ void MakeGRPCCluster(const SessionOptions& options, int n, struct Cluster { SessionOptions options; - std::vector workers; + std::vector workers; std::vector devices; // One per process Cluster() { @@ -153,14 +153,14 @@ GraphDef CreateGraphDef(int num_stages, int width, int tensor_size, return def; } -string DebugString(const Tensor& x, const Tensor& y, int tensor_size) { +std::string DebugString(const Tensor& x, const Tensor& y, int tensor_size) { CHECK_EQ(x.NumElements(), tensor_size); CHECK_EQ(y.NumElements(), tensor_size); auto x_flat = x.flat(); auto y_flat = y.flat(); // Just print the first couple of elements of each tensor CHECK_GE(tensor_size, 2); - return strings::Printf("x = [%8.6f %8.6f] y = [%8.6f %8.6f]", x_flat(0), + return absl::StrFormat("x = [%8.6f %8.6f] y = [%8.6f %8.6f]", x_flat(0), x_flat(1), y_flat(0), y_flat(1)); } diff --git a/tensorflow/core/distributed_runtime/scheduler.h b/tensorflow/core/distributed_runtime/scheduler.h index 4385db786ff38a..d277bdab74e835 100644 --- a/tensorflow/core/distributed_runtime/scheduler.h +++ b/tensorflow/core/distributed_runtime/scheduler.h @@ -86,7 +86,7 @@ class GreedyScheduler { const CostModel* cost_model_; const Graph* graph_; std::vector* priority_; - std::unordered_map device_states_; + std::unordered_map device_states_; GreedyScheduler(const GreedyScheduler&) = delete; void operator=(const GreedyScheduler&) = delete; diff --git a/tensorflow/core/distributed_runtime/server_lib.cc b/tensorflow/core/distributed_runtime/server_lib.cc index 2f7cc4184662f4..527dd49507c607 100644 --- a/tensorflow/core/distributed_runtime/server_lib.cc +++ 
b/tensorflow/core/distributed_runtime/server_lib.cc @@ -28,7 +28,7 @@ mutex* get_server_factory_lock() { return &server_factory_lock; } -typedef std::unordered_map ServerFactories; +typedef std::unordered_map ServerFactories; ServerFactories* server_factories() { static ServerFactories* factories = new ServerFactories; return factories; @@ -36,7 +36,7 @@ ServerFactories* server_factories() { } // namespace /* static */ -void ServerFactory::Register(const string& server_type, +void ServerFactory::Register(const std::string& server_type, ServerFactory* factory) { mutex_lock l(*get_server_factory_lock()); if (!server_factories()->insert({server_type, factory}).second) { @@ -56,7 +56,7 @@ absl::Status ServerFactory::GetFactory(const ServerDef& server_def, } } - std::vector server_names; + std::vector server_names; for (const auto& server_factory : *server_factories()) { server_names.push_back(server_factory.first); } diff --git a/tensorflow/core/distributed_runtime/server_lib.h b/tensorflow/core/distributed_runtime/server_lib.h index cc92d0bae12b17..c49d47970b4ca0 100644 --- a/tensorflow/core/distributed_runtime/server_lib.h +++ b/tensorflow/core/distributed_runtime/server_lib.h @@ -64,7 +64,7 @@ class ServerInterface { // Returns a target string that can be used to connect to this server using // `tensorflow::NewSession()`. - virtual const string target() const = 0; + virtual const std::string target() const = 0; virtual WorkerEnv* worker_env() = 0; virtual MasterEnv* master_env() = 0; @@ -77,7 +77,7 @@ class ServerInterface { // Add master eager context to local eager service in order to handle enqueue // requests from remote workers. virtual absl::Status AddMasterEagerContextToEagerService( - const tensorflow::uint64 context_id, EagerContext* context) = 0; + const uint64_t context_id, EagerContext* context) = 0; // Set coordination service agent instance to coordination service RPC handler virtual absl::Status SetCoordinationServiceAgentInstance( tsl::CoordinationServiceAgent* agent) = 0; @@ -113,7 +113,7 @@ class ServerFactory { // be registered by calling this method. // // The `server_type` must be unique to the server factory. - static void Register(const string& server_type, ServerFactory* factory); + static void Register(const std::string& server_type, ServerFactory* factory); // Looks up a factory that can create a server based on the given // `server_def`, and stores it in `*out_factory`. 
Returns OK on diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc index 1990f0c17c66a4..43524d19a35788 100644 --- a/tensorflow/core/distributed_runtime/tensor_coding.cc +++ b/tensorflow/core/distributed_runtime/tensor_coding.cc @@ -114,14 +114,14 @@ enum WireType { WIRETYPE_VARINT = 0, WIRETYPE_LENGTH_DELIMITED = 2, }; -inline int GetTagFieldNumber(uint32 tag) { return tag >> 3; } -inline WireType GetTagWireType(uint32 tag) { +inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } +inline WireType GetTagWireType(uint32_t tag) { return static_cast(tag & 0x7); } bool ReadVarintSizeAsInt(protobuf::io::CodedInputStream* input, int* result) { protobuf_uint64 v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { + if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { *result = static_cast(v); return true; } else { @@ -162,7 +162,7 @@ bool TensorResponse::ParseTensorSubmessage( } switch (tag) { case TensorProto::kDtypeFieldNumber: { - uint32 v; + uint32_t v; if ((wt != WIRETYPE_VARINT) || !input->ReadVarint32(&v)) return false; if (seen_tensor_content) return false; tensor_meta->set_dtype(static_cast(static_cast(v))); @@ -177,10 +177,10 @@ bool TensorResponse::ParseTensorSubmessage( break; } case TensorProto::kVersionNumberFieldNumber: { - uint32 v; + uint32_t v; if ((wt != WIRETYPE_VARINT) || !input->ReadVarint32(&v)) return false; if (seen_tensor_content) return false; - tensor_meta->set_version_number(static_cast(v)); + tensor_meta->set_version_number(static_cast(v)); break; } case TensorProto::kTensorContentFieldNumber: { @@ -242,7 +242,7 @@ bool TensorResponse::ParseFast(Source* source) { break; } case RecvTensorResponse::kIsDeadFieldNumber: { - uint32 v; + uint32_t v; if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) return false; meta_.set_is_dead(v != 0); break; @@ -260,7 +260,7 @@ bool TensorResponse::ParseFast(Source* source) { break; } case RecvTensorResponse::kRequireAckFieldNumber: { - uint32 v; + uint32_t v; if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) return false; meta_.set_require_ack(v != 0); break; diff --git a/tensorflow/core/distributed_runtime/tensor_coding_test.cc b/tensorflow/core/distributed_runtime/tensor_coding_test.cc index 9ef513f70392e4..66ba2bdce86b3a 100644 --- a/tensorflow/core/distributed_runtime/tensor_coding_test.cc +++ b/tensorflow/core/distributed_runtime/tensor_coding_test.cc @@ -48,7 +48,7 @@ class DummyDevice : public DeviceBase { class StringSource : public TensorResponse::Source { public: - explicit StringSource(const string* s, int block_size) + explicit StringSource(const std::string* s, int block_size) : s_(s), stream_(nullptr), block_size_(block_size) {} ~StringSource() override { DeleteStream(); } @@ -66,7 +66,7 @@ class StringSource : public TensorResponse::Source { } private: - const string* s_; + const std::string* s_; protobuf::io::ArrayInputStream* stream_; char space_[sizeof(protobuf::io::ArrayInputStream)]; int block_size_; @@ -83,7 +83,7 @@ class TensorResponseTest : public ::testing::Test { } else { src.AsProtoField(proto.mutable_tensor()); } - string encoded; + std::string encoded; proto.AppendToString(&encoded); StringSource source(&encoded, 1024); @@ -136,11 +136,11 @@ class TensorResponseTest : public ::testing::Test { TEST_F(TensorResponseTest, Simple) { DoTest(DT_FLOAT); DoTest(DT_DOUBLE); - DoTest(DT_INT32); - DoTest(DT_UINT16); - DoTest(DT_UINT8); - DoTest(DT_INT16); - DoTest(DT_INT8); + DoTest(DT_INT32); + 
DoTest(DT_UINT16); + DoTest(DT_UINT8); + DoTest(DT_INT16); + DoTest(DT_INT8); DoTest(DT_COMPLEX64); DoTest(DT_COMPLEX128); DoTest(DT_INT64); @@ -156,19 +156,19 @@ TEST_F(TensorResponseTest, Simple) { TEST_F(TensorResponseTest, StringTensor) { DoTestForStrings(DT_STRING); } -string MakeFloatTensorTestCase(int num_elems) { - std::vector v(num_elems); +std::string MakeFloatTensorTestCase(int num_elems) { + std::vector v(num_elems); for (int i = 0; i < num_elems; i++) { v[i] = i % 10; } Tensor src(DT_INT8, TensorShape({1, static_cast(v.size())})); - test::FillValues(&src, v); + test::FillValues(&src, v); RecvTensorResponse proto; proto.set_is_dead(false); proto.set_send_start_micros(123456); src.AsProtoTensorContent(proto.mutable_tensor()); - string encoded; + std::string encoded; proto.AppendToString(&encoded); return encoded; } @@ -176,7 +176,7 @@ string MakeFloatTensorTestCase(int num_elems) { static void BM_TensorResponse(::testing::benchmark::State& state) { const int arg = state.range(0); - string encoded = MakeFloatTensorTestCase(arg); + std::string encoded = MakeFloatTensorTestCase(arg); DummyDevice cpu_device(Env::Default()); size_t bytes = 0; for (auto i : state) { diff --git a/tensorflow/core/distributed_runtime/test_utils.h b/tensorflow/core/distributed_runtime/test_utils.h index e7ad1041dd73ff..b7316299e051c3 100644 --- a/tensorflow/core/distributed_runtime/test_utils.h +++ b/tensorflow/core/distributed_runtime/test_utils.h @@ -124,23 +124,24 @@ class TestWorkerCache : public WorkerCacheInterface { public: virtual ~TestWorkerCache() {} - void AddWorker(const string& target, WorkerInterface* wi) { + void AddWorker(const std::string& target, WorkerInterface* wi) { workers_[target] = wi; } - void AddDevice(const string& device_name, const DeviceLocality& dev_loc) { + void AddDevice(const std::string& device_name, + const DeviceLocality& dev_loc) { localities_[device_name] = dev_loc; } - void ListWorkers(std::vector* workers) const override { + void ListWorkers(std::vector* workers) const override { workers->clear(); for (auto it : workers_) { workers->push_back(it.first); } } - void ListWorkersInJob(const string& job_name, - std::vector* workers) const override { + void ListWorkersInJob(const std::string& job_name, + std::vector* workers) const override { workers->clear(); for (auto it : workers_) { DeviceNameUtils::ParsedName device_name; @@ -152,7 +153,7 @@ class TestWorkerCache : public WorkerCacheInterface { } } - WorkerInterface* GetOrCreateWorker(const string& target) override { + WorkerInterface* GetOrCreateWorker(const std::string& target) override { auto it = workers_.find(target); if (it != workers_.end()) { return it->second; @@ -160,7 +161,8 @@ class TestWorkerCache : public WorkerCacheInterface { return nullptr; } - void ReleaseWorker(const string& target, WorkerInterface* worker) override {} + void ReleaseWorker(const std::string& target, + WorkerInterface* worker) override {} absl::Status GetEagerClientCache( std::unique_ptr* eager_client_cache) override { @@ -172,7 +174,7 @@ class TestWorkerCache : public WorkerCacheInterface { return errors::Unimplemented("Unimplemented."); } - bool GetDeviceLocalityNonBlocking(const string& device, + bool GetDeviceLocalityNonBlocking(const std::string& device, DeviceLocality* locality) override { auto it = localities_.find(device); if (it != localities_.end()) { @@ -182,7 +184,8 @@ class TestWorkerCache : public WorkerCacheInterface { return false; } - void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, + 
void GetDeviceLocalityAsync(const std::string& device, + DeviceLocality* locality, StatusCallback done) override { auto it = localities_.find(device); if (it != localities_.end()) { @@ -194,8 +197,8 @@ class TestWorkerCache : public WorkerCacheInterface { } protected: - std::unordered_map workers_; - std::unordered_map localities_; + std::unordered_map workers_; + std::unordered_map localities_; }; } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc index 9fb0a76ad866f9..04b0ee20d2cc8f 100644 --- a/tensorflow/core/distributed_runtime/worker.cc +++ b/tensorflow/core/distributed_runtime/worker.cc @@ -251,7 +251,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request, if (s.ok()) { for (const auto& p : *out) { - const string& key = p.first; + const std::string& key = p.first; const Tensor& val = p.second; response->AddRecv(key, val); } @@ -271,7 +271,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts, MutableRunGraphResponseWrapper* response, StatusCallback done) { const int64_t step_id = request->step_id(); - const string& graph_handle = request->graph_handle(); + const std::string& graph_handle = request->graph_handle(); TRACEPRINTF("PartialRunGraph: %lld", step_id); absl::Status s = recent_request_ids_.TrackUnique( request->request_id(), "PartialRunGraph (Worker)", request); @@ -345,7 +345,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts, if (s.ok()) { // Construct and return the resp. for (const auto& p : *out) { - const string& key = p.first; + const std::string& key = p.first; const Tensor& val = p.second; response->AddRecv(key, val); } @@ -378,7 +378,7 @@ void Worker::CleanupGraphAsync(const CleanupGraphRequest* request, void Worker::CleanupAllAsync(const CleanupAllRequest* request, CleanupAllResponse* response, StatusCallback done) { - std::vector containers; + std::vector containers; for (const auto& c : request->container()) containers.push_back(c); env_->device_mgr->ClearContainers(containers); done(absl::OkStatus()); @@ -474,7 +474,7 @@ void Worker::GetStepSequenceAsync(const GetStepSequenceRequest* request, absl::Status Worker::PrepareRecvTensor(const Rendezvous::ParsedKey& parsed, Device** src_dev) { // Figures out which device the tensor is hosted on. - string local_name = DeviceNameUtils::LocalName(parsed.src_device); + std::string local_name = DeviceNameUtils::LocalName(parsed.src_device); TF_RETURN_IF_ERROR(env_->device_mgr->LookupDevice(local_name, src_dev)); // Does the device have the right incarnation number we expect? diff --git a/tensorflow/core/distributed_runtime/worker_cache.h b/tensorflow/core/distributed_runtime/worker_cache.h index 1ac4de35d9788f..0612a8321d3aac 100644 --- a/tensorflow/core/distributed_runtime/worker_cache.h +++ b/tensorflow/core/distributed_runtime/worker_cache.h @@ -37,22 +37,23 @@ class WorkerCacheInterface { // Updates *workers with strings naming the remote worker tasks to // which open channels have been established. - virtual void ListWorkers(std::vector* workers) const = 0; - virtual void ListWorkersInJob(const string& job_name, - std::vector* workers) const = 0; + virtual void ListWorkers(std::vector* workers) const = 0; + virtual void ListWorkersInJob(const std::string& job_name, + std::vector* workers) const = 0; // If "target" names a remote task for which an RPC channel exists // or can be constructed, returns a pointer to a WorkerInterface object // wrapping that channel. 
The returned value must be destroyed by // calling `this->ReleaseWorker(target, ret)` - virtual WorkerInterface* GetOrCreateWorker(const string& target) = 0; + virtual WorkerInterface* GetOrCreateWorker(const std::string& target) = 0; // Release a worker previously returned by this->GetOrCreateWorker(target). // // TODO(jeff,sanjay): Consider moving target into WorkerInterface. // TODO(jeff,sanjay): Unify all worker-cache impls and factor out a // per-rpc-subsystem WorkerInterface creator. - virtual void ReleaseWorker(const string& target, WorkerInterface* worker) { + virtual void ReleaseWorker(const std::string& target, + WorkerInterface* worker) { // Subclasses may override to reuse worker objects. delete worker; } @@ -61,13 +62,13 @@ class WorkerCacheInterface { // within its local environment. Returns true if *locality // was set, using only locally cached data. Returns false // if status data for that device was not available. Never blocks. - virtual bool GetDeviceLocalityNonBlocking(const string& device, + virtual bool GetDeviceLocalityNonBlocking(const std::string& device, DeviceLocality* locality) = 0; // Set *locality with the DeviceLocality of the specified remote device // within its local environment. Callback gets Status::OK if *locality // was set. - virtual void GetDeviceLocalityAsync(const string& device, + virtual void GetDeviceLocalityAsync(const std::string& device, DeviceLocality* locality, StatusCallback done) = 0; diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc index 2936c3b2667e18..5a1d3d02d4eceb 100644 --- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc +++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc @@ -68,7 +68,7 @@ bool WorkerCacheLogger::RetrieveLogs(int64_t step_id, StepStats* ss) { return false; } -void WorkerCacheLogger::Save(const string& device, int64_t step_id, +void WorkerCacheLogger::Save(const std::string& device, int64_t step_id, NodeExecStats* ns) { mutex_lock l(mu_); StepLog* sl = &log_map_[step_id]; @@ -84,33 +84,31 @@ void WorkerCacheLogger::Save(const string& device, int64_t step_id, void WorkerCacheLogger::RecordRecvTensor(int64_t step_id, int64_t start_usecs, int64_t end_usecs, - const string& tensor_name, - const string& src_device, - const string& dst_device, + const std::string& tensor_name, + const std::string& src_device, + const std::string& dst_device, int64_t bytes) { RecordDataTransfer(step_id, start_usecs, end_usecs, tensor_name, src_device, dst_device, bytes, "", "RecvTensor"); } -void WorkerCacheLogger::RecordDataTransfer(int64_t step_id, int64_t start_usecs, - int64_t end_usecs, - const string& tensor_name, - const string& src_device, - const string& dst_device, - int64_t bytes, const string& details, - const string& transfer_method_name) { +void WorkerCacheLogger::RecordDataTransfer( + int64_t step_id, int64_t start_usecs, int64_t end_usecs, + const std::string& tensor_name, const std::string& src_device, + const std::string& dst_device, int64_t bytes, const std::string& details, + const std::string& transfer_method_name) { NodeExecStats* ns = new NodeExecStats; ns->set_node_name(transfer_method_name); int64_t elapsed_usecs = end_usecs - start_usecs; if (details.empty()) { auto byte_string = absl::StrCat("[", bytes, "B] "); if (bytes >= 0.1 * 1048576.0) { - byte_string = strings::Printf("[%.1fMB] ", bytes / 1048576.0); + byte_string = absl::StrFormat("[%.1fMB] ", bytes / 1048576.0); } float mbs_rate = (8.0 * 
static_cast(bytes)) / elapsed_usecs; auto rate_string = (mbs_rate >= 1000.0) - ? strings::Printf("[%.1fGb/s] ", mbs_rate / 1000.0) - : strings::Printf("[%fMb/s] ", mbs_rate); + ? absl::StrFormat("[%.1fGb/s] ", mbs_rate / 1000.0) + : absl::StrFormat("[%fMb/s] ", mbs_rate); auto label = strings::StrCat(byte_string, rate_string, tensor_name, " from ", src_device, " to ", dst_device); ns->set_timeline_label(label); diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.h b/tensorflow/core/distributed_runtime/worker_cache_logger.h index f5ef19bf6646f7..e7a1ebf0c40708 100644 --- a/tensorflow/core/distributed_runtime/worker_cache_logger.h +++ b/tensorflow/core/distributed_runtime/worker_cache_logger.h @@ -57,20 +57,22 @@ class WorkerCacheLogger { // Generates a NodeExecStats record with the given data, and saves for // later retrieval by RetrieveLogs(). void RecordRecvTensor(int64_t step_id, int64_t start_usecs, int64_t end_usecs, - const string& tensor_name, const string& src_device, - const string& dst_device, int64_t bytes); + const std::string& tensor_name, + const std::string& src_device, + const std::string& dst_device, int64_t bytes); // Generates a NodeExecStats record with the given data, and saves for // later retrieval by RetrieveLogs(). void RecordDataTransfer(int64_t step_id, int64_t start_usecs, - int64_t end_usecs, const string& tensor_name, - const string& src_device, const string& dst_device, - int64_t bytes, const string& details, - const string& transfer_method_name); + int64_t end_usecs, const std::string& tensor_name, + const std::string& src_device, + const std::string& dst_device, int64_t bytes, + const std::string& details, + const std::string& transfer_method_name); private: mutex count_mu_; - int32 want_logging_count_ TF_GUARDED_BY(count_mu_) = 0; + int32_t want_logging_count_ TF_GUARDED_BY(count_mu_) = 0; struct StepLog { StepStats step_stats; @@ -81,7 +83,7 @@ class WorkerCacheLogger { LogMap log_map_ TF_GUARDED_BY(mu_); // Records "ns" in log_map_ under the given device and step. - void Save(const string& device, int64_t step_id, NodeExecStats* ns); + void Save(const std::string& device, int64_t step_id, NodeExecStats* ns); void ClearLogsWithLock() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); }; diff --git a/tensorflow/core/distributed_runtime/worker_cache_partial.cc b/tensorflow/core/distributed_runtime/worker_cache_partial.cc index 58b130228e00dd..47fdcce387297d 100644 --- a/tensorflow/core/distributed_runtime/worker_cache_partial.cc +++ b/tensorflow/core/distributed_runtime/worker_cache_partial.cc @@ -27,7 +27,7 @@ limitations under the License. 
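// A note on the strings::Printf -> absl::StrFormat conversions in the logger
// code above: absl::StrFormat takes the same printf-style format string but
// type-checks its arguments at compile time. A standalone sketch of the
// byte/rate formatting pattern (hypothetical helper, not WorkerCacheLogger):
#include <cstdint>
#include <string>
#include "absl/strings/str_format.h"

std::string DescribeTransfer(int64_t bytes, int64_t elapsed_usecs) {
  std::string byte_string = absl::StrFormat("[%.1fMB] ", bytes / 1048576.0);
  float mbs_rate = (8.0 * static_cast<float>(bytes)) / elapsed_usecs;
  std::string rate_string =
      mbs_rate >= 1000.0 ? absl::StrFormat("[%.1fGb/s] ", mbs_rate / 1000.0)
                         : absl::StrFormat("[%fMb/s] ", mbs_rate);
  return byte_string + rate_string;
}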
namespace tensorflow { bool WorkerCachePartial::GetDeviceLocalityNonBlocking( - const string& device_name, DeviceLocality* locality) { + const std::string& device_name, DeviceLocality* locality) { mutex_lock lock(mu_); // could use reader lock auto iter = device_status_cache_.find(device_name); if (iter != device_status_cache_.end()) { @@ -37,7 +37,7 @@ bool WorkerCachePartial::GetDeviceLocalityNonBlocking( return false; } -void WorkerCachePartial::GetDeviceLocalityAsync(const string& device_name, +void WorkerCachePartial::GetDeviceLocalityAsync(const std::string& device_name, DeviceLocality* locality, StatusCallback done) { if (!GetDeviceLocalityNonBlocking(device_name, locality)) { @@ -55,9 +55,9 @@ void WorkerCachePartial::GetDeviceLocalityAsync(const string& device_name, } absl::Status WorkerCachePartial::RefreshDeviceStatus( - const string& device_name) { - string task; - string device; + const std::string& device_name) { + std::string task; + std::string device; absl::Status s; if (!DeviceNameUtils::SplitDeviceName(device_name, &task, &device)) { s = errors::InvalidArgument("Bad device name to RefreshDeviceStatus: ", diff --git a/tensorflow/core/distributed_runtime/worker_cache_partial.h b/tensorflow/core/distributed_runtime/worker_cache_partial.h index b5a500b86dae00..08e272a3bb6db6 100644 --- a/tensorflow/core/distributed_runtime/worker_cache_partial.h +++ b/tensorflow/core/distributed_runtime/worker_cache_partial.h @@ -31,10 +31,11 @@ namespace tensorflow { // device status attributes. class WorkerCachePartial : public WorkerCacheInterface { public: - bool GetDeviceLocalityNonBlocking(const string& device, + bool GetDeviceLocalityNonBlocking(const std::string& device, DeviceLocality* locality) override; - void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, + void GetDeviceLocalityAsync(const std::string& device, + DeviceLocality* locality, StatusCallback) override; ~WorkerCachePartial() override {} @@ -47,9 +48,9 @@ class WorkerCachePartial : public WorkerCacheInterface { // Initiate a GetStatusAsync to the remote task named by "task", and // update the cache with all the DeviceAttributes reported. - absl::Status RefreshDeviceStatus(const string& device_name); + absl::Status RefreshDeviceStatus(const std::string& device_name); - typedef std::unordered_map StatusMap; + typedef std::unordered_map StatusMap; StatusMap device_status_cache_ TF_GUARDED_BY(mu_); }; diff --git a/tensorflow/core/distributed_runtime/worker_cache_wrapper.h b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h index 7f709b4fb5c1bb..8917da3825773b 100644 --- a/tensorflow/core/distributed_runtime/worker_cache_wrapper.h +++ b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h @@ -29,11 +29,11 @@ class WorkerCacheWrapper : public WorkerCacheInterface { // Updates *workers with strings naming the remote worker tasks to // which open channels have been established. - void ListWorkers(std::vector* workers) const override { + void ListWorkers(std::vector* workers) const override { return wrapped_->ListWorkers(workers); } - void ListWorkersInJob(const string& job_name, - std::vector* workers) const override { + void ListWorkersInJob(const std::string& job_name, + std::vector* workers) const override { return wrapped_->ListWorkersInJob(job_name, workers); } @@ -41,7 +41,7 @@ class WorkerCacheWrapper : public WorkerCacheInterface { // or can be constructed, returns a pointer to a WorkerInterface object // wrapping that channel. 
The returned value must be destroyed by // calling `this->ReleaseWorker(target, ret)` - WorkerInterface* GetOrCreateWorker(const string& target) override { + WorkerInterface* GetOrCreateWorker(const std::string& target) override { return wrapped_->GetOrCreateWorker(target); } @@ -50,7 +50,8 @@ class WorkerCacheWrapper : public WorkerCacheInterface { // TODO(jeff,sanjay): Consider moving target into WorkerInterface. // TODO(jeff,sanjay): Unify all worker-cache impls and factor out a // per-rpc-subsystem WorkerInterface creator. - void ReleaseWorker(const string& target, WorkerInterface* worker) override { + void ReleaseWorker(const std::string& target, + WorkerInterface* worker) override { return wrapped_->ReleaseWorker(target, worker); } @@ -69,7 +70,7 @@ class WorkerCacheWrapper : public WorkerCacheInterface { // within its local environment. Returns true if *locality // was set, using only locally cached data. Returns false // if status data for that device was not available. Never blocks. - bool GetDeviceLocalityNonBlocking(const string& device, + bool GetDeviceLocalityNonBlocking(const std::string& device, DeviceLocality* locality) override { return wrapped_->GetDeviceLocalityNonBlocking(device, locality); } @@ -77,7 +78,8 @@ class WorkerCacheWrapper : public WorkerCacheInterface { // Set *locality with the DeviceLocality of the specified remote device // within its local environment. Callback gets Status::OK if *locality // was set. - void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, + void GetDeviceLocalityAsync(const std::string& device, + DeviceLocality* locality, StatusCallback done) override { return wrapped_->GetDeviceLocalityAsync(device, locality, std::move(done)); } diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc index d9286d0d148843..cb66a4f845f5b7 100644 --- a/tensorflow/core/distributed_runtime/worker_session.cc +++ b/tensorflow/core/distributed_runtime/worker_session.cc @@ -43,16 +43,16 @@ class WorkerFreeListCache : public WorkerCacheInterface { } } - void ListWorkers(std::vector* workers) const override { + void ListWorkers(std::vector* workers) const override { wrapped_->ListWorkers(workers); } - void ListWorkersInJob(const string& job_name, - std::vector* workers) const override { + void ListWorkersInJob(const std::string& job_name, + std::vector* workers) const override { wrapped_->ListWorkersInJob(job_name, workers); } - WorkerInterface* GetOrCreateWorker(const string& target) override { + WorkerInterface* GetOrCreateWorker(const std::string& target) override { { // Fast path if worker has been created. tf_shared_lock l(mu_); @@ -88,16 +88,18 @@ class WorkerFreeListCache : public WorkerCacheInterface { return wrapped_->GetCoordinationClientCache(coordination_client_cache); } - void ReleaseWorker(const string& target, WorkerInterface* worker) override { + void ReleaseWorker(const std::string& target, + WorkerInterface* worker) override { // TODO(jeff,sanjay): Should decrement ref-count when we implement eviction. 
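// For context on WorkerFreeListCache::GetOrCreateWorker above: it uses a
// shared-lock fast path and only takes the exclusive lock to create a
// missing entry. A simplified standalone sketch of that double-checked
// pattern (hypothetical names; TF uses its own mutex and worker types):
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <unordered_map>

struct Worker {};  // Placeholder for WorkerInterface in this sketch.

class WorkerCacheSketch {
 public:
  Worker* GetOrCreateWorker(const std::string& target) {
    {
      // Fast path: a shared lock is enough when the worker already exists.
      std::shared_lock<std::shared_mutex> l(mu_);
      auto it = workers_.find(target);
      if (it != workers_.end()) return it->second.get();
    }
    // Slow path: exclusive lock, re-check, then create exactly once.
    std::unique_lock<std::shared_mutex> l(mu_);
    auto& slot = workers_[target];
    if (!slot) slot = std::make_unique<Worker>();
    return slot.get();
  }

 private:
  std::shared_mutex mu_;
  std::unordered_map<std::string, std::unique_ptr<Worker>> workers_;
};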
} - bool GetDeviceLocalityNonBlocking(const string& device, + bool GetDeviceLocalityNonBlocking(const std::string& device, DeviceLocality* locality) override { return wrapped_->GetDeviceLocalityNonBlocking(device, locality); } - void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, + void GetDeviceLocalityAsync(const std::string& device, + DeviceLocality* locality, StatusCallback done) override { wrapped_->GetDeviceLocalityAsync(device, locality, done); } @@ -121,13 +123,13 @@ class WorkerFreeListCache : public WorkerCacheInterface { // TODO(jeff,sanjay): Eviction when the map becomes too big. mutex mu_; - std::unordered_map workers_ TF_GUARDED_BY(mu_); + std::unordered_map workers_ TF_GUARDED_BY(mu_); }; } // namespace WorkerSession::WorkerSession( - const string& session_name, const string& worker_name, + const std::string& session_name, const std::string& worker_name, std::unique_ptr worker_cache, std::unique_ptr device_mgr, std::unique_ptr graph_mgr, std::unique_ptr remote_device_mgr, @@ -165,7 +167,7 @@ absl::Status WorkerSession::UpdateWorkerCacheAndDevices( /* static */ std::shared_ptr WorkerSession::CreateWithBorrowedDeviceMgr( - const string& session_name, const string& worker_name, + const std::string& session_name, const std::string& worker_name, std::unique_ptr worker_cache, DeviceMgr* borrowed_device_mgr, std::unique_ptr graph_mgr, std::unique_ptr remote_device_mgr, @@ -177,7 +179,7 @@ std::shared_ptr WorkerSession::CreateWithBorrowedDeviceMgr( } WorkerSession::WorkerSession( - const string& session_name, const string& worker_name, + const std::string& session_name, const std::string& worker_name, std::unique_ptr worker_cache, DeviceMgr* borrowed_device_mgr, std::unique_ptr graph_mgr, std::unique_ptr remote_device_mgr, diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h index e366accf18075b..5f8d66d93b6c69 100644 --- a/tensorflow/core/distributed_runtime/worker_session.h +++ b/tensorflow/core/distributed_runtime/worker_session.h @@ -51,8 +51,8 @@ class WorkerSession { DynamicDeviceMgr* remote_device_mgr() { return remote_device_mgr_.get(); } - const string& session_name() const { return session_name_; } - const string& worker_name() const { return worker_name_; } + const std::string& session_name() const { return session_name_; } + const std::string& worker_name() const { return worker_name_; } WorkerCacheInterface* worker_cache() const { tf_shared_lock l(worker_session_state_mu_); @@ -64,7 +64,7 @@ class WorkerSession { return cluster_flr_.get(); } - WorkerSession(const string& session_name, const string& worker_name, + WorkerSession(const std::string& session_name, const std::string& worker_name, std::unique_ptr worker_cache, std::unique_ptr device_mgr, std::unique_ptr graph_mgr, @@ -72,7 +72,7 @@ class WorkerSession { DistributedFunctionLibraryRuntimeCreator cluster_flr_creator); static std::shared_ptr CreateWithBorrowedDeviceMgr( - const string& session_name, const string& worker_name, + const std::string& session_name, const std::string& worker_name, std::unique_ptr worker_cache, DeviceMgr* borrowed_device_mgr, std::unique_ptr graph_mgr, std::unique_ptr remote_device_mgr, @@ -98,7 +98,7 @@ class WorkerSession { ~WorkerSession(); private: - WorkerSession(const string& session_name, const string& worker_name, + WorkerSession(const std::string& session_name, const std::string& worker_name, std::unique_ptr worker_cache, DeviceMgr* borrowed_device_mgr, std::unique_ptr graph_mgr, @@ 
-106,10 +106,10 @@ class WorkerSession { DistributedFunctionLibraryRuntimeCreator cluster_flr_creator); // The name of the session. - const string session_name_; + const std::string session_name_; // The name of the worker. E.g., /job:mnist/replica:0/task:1. - const string worker_name_; + const std::string worker_name_; mutable mutex worker_session_state_mu_; // Object from which WorkerInterface instances can be obtained. diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD index 9de70eb28c1b07..8a10b3d5557f42 100644 --- a/tensorflow/core/framework/BUILD +++ b/tensorflow/core/framework/BUILD @@ -739,6 +739,7 @@ cc_library( "//waymo/ml/compiler/frontend/kernels:__pkg__", "//waymo/ml/compiler/runtime/alpine/core:__pkg__", "//waymo/ml/woodshed/ops:__pkg__", + "//waymo/perception/training/point_lens/unified_dataset/python/tensorflow:__pkg__", ], deps = [ "//tensorflow/core/lib/core:refcount", @@ -1554,7 +1555,7 @@ tf_proto_library( name = "log_memory_proto", srcs = ["log_memory.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":allocation_description_proto", ":tensor_description_proto", ":tensor_shape_proto", @@ -1572,7 +1573,8 @@ tf_proto_library( name = "graph_proto", srcs = ["graph.proto"], make_default_target_header_only = True, - protodeps = [ + visibility = ["//visibility:public"], + deps = [ ":attr_value_proto", ":function_proto", ":graph_debug_info_proto", @@ -1584,14 +1586,13 @@ tf_proto_library( ":types_proto", ":versions_proto", ], - visibility = ["//visibility:public"], ) tf_proto_library( name = "node_def_proto", srcs = ["node_def.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":attr_value_proto", ":full_type_proto", ":resource_handle_proto", @@ -1623,7 +1624,7 @@ tf_proto_library( name = "tensor_description_proto", srcs = ["tensor_description.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":allocation_description_proto", ":tensor_shape_proto", ":types_proto", @@ -1641,7 +1642,7 @@ tf_proto_library( name = "resource_handle_proto", srcs = ["resource_handle.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":tensor_shape_proto", ":types_proto", ], @@ -1651,7 +1652,7 @@ tf_proto_library( name = "step_stats_proto", srcs = ["step_stats.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":allocation_description_proto", ":tensor_description_proto", ":tensor_shape_proto", @@ -1669,7 +1670,7 @@ tf_proto_library( name = "kernel_def_proto", srcs = ["kernel_def.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":attr_value_proto", ":resource_handle_proto", ":tensor_proto", @@ -1682,7 +1683,11 @@ tf_proto_library( name = "op_def_proto", srcs = ["op_def.proto"], make_default_target_header_only = True, - protodeps = [ + visibility = [ + "//tensorflow/core:__subpackages__", + "//tensorflow/python:__pkg__", + ], + deps = [ ":attr_value_proto", ":full_type_proto", ":resource_handle_proto", @@ -1690,22 +1695,12 @@ tf_proto_library( ":tensor_shape_proto", ":types_proto", ], - visibility = [ - "//tensorflow/core:__subpackages__", - "//tensorflow/python:__pkg__", - ], ) tf_proto_library( name = "attr_value_proto", srcs = ["attr_value.proto"], make_default_target_header_only = True, - protodeps = [ - ":resource_handle_proto", - ":tensor_proto", - ":tensor_shape_proto", - ":types_proto", - ], visibility = [ #internal library, "//tensorflow/core:__subpackages__", @@ -1714,20 +1709,26 @@ tf_proto_library( 
"//tensorflow/security/fuzzing:__subpackages__", "//waymo/ml/deploy/benchmark:__subpackages__", ], + deps = [ + ":resource_handle_proto", + ":tensor_proto", + ":tensor_shape_proto", + ":types_proto", + ], ) tf_proto_library( name = "full_type_proto", srcs = ["full_type.proto"], make_default_target_header_only = True, - protodeps = [], + deps = [], ) tf_proto_library( name = "tensor_proto", srcs = ["tensor.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":resource_handle_proto", ":tensor_shape_proto", ":types_proto", @@ -1744,7 +1745,7 @@ tf_proto_library( name = "api_def_proto", srcs = ["api_def.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":attr_value_proto", ":resource_handle_proto", ":tensor_proto", @@ -1757,7 +1758,7 @@ tf_proto_library( name = "cpp_shape_inference_proto", srcs = ["cpp_shape_inference.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":full_type_proto", ":tensor_shape_proto", ":types_proto", @@ -1774,7 +1775,7 @@ tf_proto_library( name = "graph_transfer_info_proto", srcs = ["graph_transfer_info.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":types_proto", ], ) @@ -1796,7 +1797,7 @@ tf_proto_library( name = "cost_graph_proto", srcs = ["cost_graph.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":tensor_shape_proto", ":types_proto", ], @@ -1812,7 +1813,10 @@ tf_proto_library( name = "function_proto", srcs = ["function.proto"], make_default_target_header_only = True, - protodeps = [ + visibility = [ + "//tensorflow/python:__pkg__", + ] + default_visibility, + deps = [ ":attr_value_proto", ":node_def_proto", ":op_def_proto", @@ -1821,9 +1825,6 @@ tf_proto_library( ":tensor_shape_proto", ":types_proto", ], - visibility = [ - "//tensorflow/python:__pkg__", - ] + default_visibility, ) # copybara:uncomment_begin(google-only) @@ -1840,14 +1841,14 @@ tf_proto_library( name = "summary_proto", srcs = ["summary.proto"], make_default_target_header_only = True, - protodeps = [ + exports = ["@local_xla//xla/tsl/protobuf:histogram_proto"], + deps = [ ":resource_handle_proto", ":tensor_proto", ":tensor_shape_proto", ":types_proto", "@local_xla//xla/tsl/protobuf:histogram_proto", ], - exports = ["@local_xla//xla/tsl/protobuf:histogram_proto"], ) tf_proto_library( @@ -1860,7 +1861,7 @@ tf_proto_library( name = "dataset_proto", srcs = ["dataset.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":tensor_proto", ":tensor_shape_proto", ":types_proto", @@ -1877,7 +1878,7 @@ tf_proto_library( name = "dataset_options_proto", srcs = ["dataset_options.proto"], make_default_target_header_only = True, - protodeps = [ + deps = [ ":model_proto", ], ) @@ -1886,24 +1887,27 @@ tf_proto_library( name = "optimized_function_graph_proto", srcs = ["optimized_function_graph.proto"], make_default_target_header_only = True, - protodeps = [ - ":types_proto", + deps = [ ":graph_proto", + ":types_proto", ], ) tf_proto_library( name = "protos_all", make_default_target_header_only = True, - protodeps = [ + tags = [ + "alt_dep=//third_party/tensorflow/core:protos_all", + ], + deps = [ ":allocation_description_proto", ":api_def_proto", - ":cpp_shape_inference_proto", ":attr_value_proto", ":cost_graph_proto", - ":dataset_proto", + ":cpp_shape_inference_proto", ":dataset_metadata_proto", ":dataset_options_proto", + ":dataset_proto", ":device_attributes_proto", ":full_type_proto", ":function_proto", @@ -1914,8 +1918,8 @@ tf_proto_library( 
":log_memory_proto", ":model_proto", ":node_def_proto", - ":optimized_function_graph_proto", ":op_def_proto", + ":optimized_function_graph_proto", ":reader_base_proto", ":resource_handle_proto", ":step_stats_proto", @@ -1928,9 +1932,6 @@ tf_proto_library( ":variable_proto", ":versions_proto", ], - tags = [ - "alt_dep=//third_party/tensorflow/core:protos_all", - ], ) tf_cc_fuzz_test( diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc index ba3f396b6c3ef0..76bfb059935786 100644 --- a/tensorflow/core/framework/allocator_test.cc +++ b/tensorflow/core/framework/allocator_test.cc @@ -210,7 +210,7 @@ TEST(CPUAllocatorTest, Sizes) { TEST(CPUAllocatorTest, ProfilerReporting) { // TODO(b/196611863): Make debugging work even without GetAllocatedSize. - void* p = port::AlignedMalloc(8, 1); + void* p = tsl::port::AlignedMalloc(8, static_cast(1)); const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p); port::AlignedFree(p); if (alloc_size == 0) { diff --git a/tensorflow/core/framework/function_handle_cache.cc b/tensorflow/core/framework/function_handle_cache.cc index d0d995cbcc3712..e26467011ac2dd 100644 --- a/tensorflow/core/framework/function_handle_cache.cc +++ b/tensorflow/core/framework/function_handle_cache.cc @@ -23,7 +23,7 @@ namespace tensorflow { FunctionHandleCache::FunctionHandleCache(FunctionLibraryRuntime* lib) : lib_(lib), state_handle_( - strings::Printf("%lld", static_cast(random::New64()))) {} + absl::StrFormat("%lld", static_cast(random::New64()))) {} FunctionHandleCache::~FunctionHandleCache() { absl::Status s = Clear(); diff --git a/tensorflow/core/framework/local_rendezvous.cc b/tensorflow/core/framework/local_rendezvous.cc index 6a56c1695d35b9..36e87d36d594fd 100644 --- a/tensorflow/core/framework/local_rendezvous.cc +++ b/tensorflow/core/framework/local_rendezvous.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/status/status.h" #include "absl/strings/str_format.h" #include "xla/tsl/platform/logging.h" #include "tensorflow/core/activity_watcher/activity.h" @@ -404,8 +405,13 @@ void LocalRendezvous::DoAbort(const absl::Status& status) { mutex_lock l(mu_); status_.Update(status); } - LOG_EVERY_POW_2(INFO) << "Local rendezvous is aborting with status: " - << status; + + // OUT_OF_RANGE implies a normal end of sequence (e.g. for tf.data), + // so we suppress the warning to avoid log noise. + if (status.code() != absl::StatusCode::kOutOfRange) { + LOG_EVERY_POW_2(WARNING) + << "Local rendezvous is aborting with status: " << status; + } // Keeps one Item to make sure the current rendezvous won't be destructed. 
std::unique_ptr to_delete; diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc index 6ad728f1a0de2c..16e9df7641753b 100644 --- a/tensorflow/core/framework/model_test.cc +++ b/tensorflow/core/framework/model_test.cc @@ -1657,7 +1657,7 @@ TEST_F(ModelTimingTest, TestDefaultParallelismInParallelInterleave) { const int32_t parallelism = 1; const int32_t deterministic = 1; const int32_t cycle_length = 3; - ComputeModelTiming(strings::Printf( + ComputeModelTiming(absl::StrFormat( R"pb( nodes: { key: 1 @@ -1841,7 +1841,7 @@ TEST_P(ParallelInterleaveTimingTest, ScenarioTest) { const int32_t parallelism = std::get<0>(GetParam()); const int32_t deterministic = std::get<1>(GetParam()); const int32_t cycle_length = std::get<2>(GetParam()); - ComputeModelTiming(strings::Printf( + ComputeModelTiming(absl::StrFormat( R"pb( nodes: { key: 1 diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc index c83acfe5329311..0c59566c84261b 100644 --- a/tensorflow/core/framework/resource_mgr.cc +++ b/tensorflow/core/framework/resource_mgr.cc @@ -204,7 +204,7 @@ std::string ResourceMgr::DebugString() const { std::vector text; text.reserve(lines.size()); for (const Line& line : lines) { - text.push_back(strings::Printf( + text.push_back(absl::StrFormat( "%-20s | %-40s | %-40s | %-s", line.container->c_str(), line.type.c_str(), line.resource->c_str(), line.detail.c_str())); } diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h index 5db5b0bcd74e84..fa19396557bf0a 100644 --- a/tensorflow/core/framework/tensor.h +++ b/tensorflow/core/framework/tensor.h @@ -1095,9 +1095,10 @@ void Tensor::ValueAndTensorBuffer::HostScalarTensorBuffer::operator delete( template Tensor::Tensor(T value, host_scalar_tag tag) { - auto* value_and_buf = static_cast*>( - port::AlignedMalloc(sizeof(typename Tensor::ValueAndTensorBuffer), - EIGEN_MAX_ALIGN_BYTES)); + auto* value_and_buf = + static_cast*>(tsl::port::AlignedMalloc( + sizeof(typename Tensor::ValueAndTensorBuffer), + static_cast(EIGEN_MAX_ALIGN_BYTES))); new (&value_and_buf->value) T(std::move(value)); new (&value_and_buf->tensor_buffer) typename Tensor::ValueAndTensorBuffer::HostScalarTensorBuffer( diff --git a/tensorflow/core/framework/type_index.h b/tensorflow/core/framework/type_index.h index 22c0d608076af5..0277dd1418b524 100644 --- a/tensorflow/core/framework/type_index.h +++ b/tensorflow/core/framework/type_index.h @@ -57,8 +57,9 @@ class TypeIndex { static TypeIndex Make() { #ifdef PLATFORM_CLOUD_TPU static bool hash_bit[1]; - return TypeIndex(static_cast(reinterpret_cast(hash_bit)), - typeid(T).name()); + return TypeIndex( + static_cast(reinterpret_cast(hash_bit)), + typeid(T).name()); #endif #if defined(__GXX_RTTI) || defined(_CPPRTTI) diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc index 2506bdd433242d..8a3563ab64322e 100644 --- a/tensorflow/core/framework/variant_op_registry_test.cc +++ b/tensorflow/core/framework/variant_op_registry_test.cc @@ -231,8 +231,8 @@ TEST(VariantOpUnaryOpRegistryTest, TestBasicGPU) { Variant v_out = VariantValue(); OpKernelContext* null_context_pointer = nullptr; - Status s0 = UnaryOpVariant(null_context_pointer, - ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out); + absl::Status s0 = UnaryOpVariant( + null_context_pointer, ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out); EXPECT_FALSE(s0.ok()); EXPECT_TRUE(absl::StrContains(s0.message(), "early exit zeros_like")); @@ -304,7 
+304,7 @@ TEST(VariantOpAddRegistryTest, TestBasicGPU) { Variant v_out = VariantValue(); OpKernelContext* null_context_pointer = nullptr; - Status s0 = BinaryOpVariants( + absl::Status s0 = BinaryOpVariants( null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out); EXPECT_FALSE(s0.ok()); EXPECT_TRUE(absl::StrContains(s0.message(), "early exit add")); diff --git a/tensorflow/core/function/testing/test_pass.h b/tensorflow/core/function/testing/test_pass.h index 93c2116f5ad996..c3bee77403884c 100644 --- a/tensorflow/core/function/testing/test_pass.h +++ b/tensorflow/core/function/testing/test_pass.h @@ -101,7 +101,8 @@ struct TestPassTfDialect DCHECK(target != nullptr); builder.setInsertionPoint(target); - auto replacement = builder.create( + auto replacement = mlir::TF::AddV2Op::create( + builder, mlir::NameLoc::get( mlir::StringAttr::get(builder.getContext(), "x_plus_y")), target->getResultTypes(), target->getOperand(0), target->getOperand(1)); diff --git a/tensorflow/core/graph/algorithm_test.cc b/tensorflow/core/graph/algorithm_test.cc index 3c6cc215e95bc5..0c560b57044cb4 100644 --- a/tensorflow/core/graph/algorithm_test.cc +++ b/tensorflow/core/graph/algorithm_test.cc @@ -47,11 +47,12 @@ REGISTER_OP("TestBinary") // Compares that the order of nodes in 'inputs' respects the // pair orders described in 'ordered_pairs'. -bool ExpectBefore(const std::vector>& ordered_pairs, - const std::vector& inputs, string* error) { - for (const std::pair& pair : ordered_pairs) { - const string& before_node = pair.first; - const string& after_node = pair.second; +bool ExpectBefore( + const std::vector>& ordered_pairs, + const std::vector& inputs, std::string* error) { + for (const std::pair& pair : ordered_pairs) { + const std::string& before_node = pair.first; + const std::string& after_node = pair.second; bool seen_before = false; bool seen_both = false; for (const Node* node : inputs) { @@ -97,10 +98,10 @@ TEST(AlgorithmTest, ReversePostOrder) { GetReversePostOrder(g, &order); // Check that the order respects the dependencies correctly. - std::vector> reverse_orders = { + std::vector> reverse_orders = { {"W1", "input"}, {"W1", "t1"}, {"W1", "t2"}, {"W1", "t3"}, {"input", "t1"}, {"input", "t3"}, {"t1", "t2"}, {"W2", "t3"}}; - string error; + std::string error; EXPECT_TRUE(ExpectBefore(reverse_orders, order, &error)) << error; // A false ordering should fail the check. @@ -111,7 +112,7 @@ TEST(AlgorithmTest, ReversePostOrder) { GetPostOrder(g, &order); // Check that the order respects the dependencies correctly. - std::vector> orders = { + std::vector> orders = { {"input", "W1"}, {"t1", "W1"}, {"t2", "W1"}, {"t3", "W1"}, {"t1", "input"}, {"t3", "input"}, {"t2", "t1"}, {"t3", "W2"}}; EXPECT_TRUE(ExpectBefore(orders, order, &error)) << error; @@ -131,7 +132,7 @@ TEST(AlgorithmTest, ReversePostOrderStable) { // raw pointer value of Node. Stable post order suppose to remove this // nondeterminism by enforcing an ordering based on node ids. 
GraphDefBuilder b(GraphDefBuilder::kFailImmediately); - string error; + std::string error; Node* w1 = SourceOp("TestParams", b.opts().WithName("W1")); Node* input = SourceOp("TestInput", b.opts().WithName("input").WithControlInput(w1)); diff --git a/tensorflow/core/graph/benchmark_testlib.h b/tensorflow/core/graph/benchmark_testlib.h index 54716405fd2a6a..98a488d4209a9b 100644 --- a/tensorflow/core/graph/benchmark_testlib.h +++ b/tensorflow/core/graph/benchmark_testlib.h @@ -73,7 +73,7 @@ inline GraphDef CreateGraphDef(int num_nodes, int num_edges_per_node) { const int kNumInNodes = 10 * num_edges_per_node; GraphDef graph_def; - auto create_node = [](const string& name, const string& op) { + auto create_node = [](const std::string& name, const std::string& op) { NodeDef node; node.set_name(name); node.set_op(op); @@ -115,17 +115,17 @@ inline GraphDef CreateRandomGraph(int size) { random::PhiloxRandom philox(0x12345); random::SimplePhilox rnd(&philox); - string prefix = "long_node_name_prefix_to_measure_string_copy_overhead"; + std::string prefix = "long_node_name_prefix_to_measure_string_copy_overhead"; GraphDef graph; for (int i = 0; i < size; ++i) { - const string name = absl::StrCat(prefix, i); - const uint32 num_inputs = rnd.Uniform(std::min(i, 5)); + const std::string name = absl::StrCat(prefix, i); + const uint32_t num_inputs = rnd.Uniform(std::min(i, 5)); NodeDef node; node.set_name(name); for (int n = 0; n < num_inputs; ++n) { - const uint32 input_node = rnd.Uniform(i); + const uint32_t input_node = rnd.Uniform(i); node.add_input(absl::StrCat(prefix, input_node)); } @@ -142,7 +142,7 @@ inline GraphDef CreateFaninFanoutNodeGraph(int num_regular_fanins, bool fanout_unique_index) { GraphDef graph; - auto create_node = [](const string& name) { + auto create_node = [](const std::string& name) { NodeDef node; node.set_name(name); return node; @@ -151,14 +151,14 @@ inline GraphDef CreateFaninFanoutNodeGraph(int num_regular_fanins, NodeDef node = create_node(/*name=*/"node"); for (int i = 0; i < num_regular_fanins; ++i) { - const string input_node_name = absl::StrFormat("in%05d", i); + const std::string input_node_name = absl::StrFormat("in%05d", i); NodeDef input_node = create_node(/*name=*/input_node_name); *graph.add_node() = std::move(input_node); node.add_input(input_node_name); } for (int i = 0; i < num_controlling_fanins; ++i) { - const string input_node_name = absl::StrFormat("control_in%05d", i); + const std::string input_node_name = absl::StrFormat("control_in%05d", i); NodeDef input_node = create_node(/*name=*/input_node_name); *graph.add_node() = std::move(input_node); node.add_input(absl::StrCat("^", input_node_name)); @@ -166,13 +166,13 @@ inline GraphDef CreateFaninFanoutNodeGraph(int num_regular_fanins, for (int i = 0; i < num_regular_fanouts; ++i) { NodeDef output_node = create_node(/*name=*/absl::StrFormat("out%05d", i)); - const string input_node_index = + const std::string input_node_index = fanout_unique_index ? 
absl::StrCat(node.name(), ":", i) : node.name(); output_node.add_input(input_node_index); *graph.add_node() = std::move(output_node); } - const string controlled_fanout_input = absl::StrCat("^", node.name()); + const std::string controlled_fanout_input = absl::StrCat("^", node.name()); for (int i = 0; i < num_controlled_fanouts; ++i) { NodeDef output_node = create_node(/*name=*/absl::StrFormat("control_out%05d", i)); diff --git a/tensorflow/core/graph/collective_order.cc b/tensorflow/core/graph/collective_order.cc index 9f8a498d88b47e..3ca3748eeb18be 100644 --- a/tensorflow/core/graph/collective_order.cc +++ b/tensorflow/core/graph/collective_order.cc @@ -25,8 +25,9 @@ namespace { // them. absl::Status DiscoverDataDependencies( const Graph* graph, std::vector* collective_nodes, - std::vector* instance_keys, - absl::flat_hash_map>* data_dependencies) { + std::vector* instance_keys, + absl::flat_hash_map>* + data_dependencies) { absl::Status s; // Algorithm: do Reverse DFS starting at sink. `node_leave` is called when // all parents of `node` have been visited. At that point, @@ -69,8 +70,8 @@ absl::Status DiscoverDataDependencies( // If there exists an edge a -> b then `dependency_edges[a]` contains `b` absl::Status CreateControlDependencies( const std::vector& collective_nodes, - const std::vector& instance_keys, - absl::flat_hash_map>* data_dependencies, + const std::vector& instance_keys, + absl::flat_hash_map>* data_dependencies, absl::flat_hash_map>* dependency_edges) { // If there exists some path a -> ... -> b then `all_paths[a]` contains `b` absl::flat_hash_map> all_paths; @@ -158,7 +159,7 @@ absl::Status InsertControlDependencies( } else if (order_type == GraphCollectiveOrder::kAttrs) { // `wait_for` is the inverse of `dependency_edges`, i.e. `wait_for[node]` // contains the list of instance keys for which `node` must wait. - absl::flat_hash_map> wait_for; + absl::flat_hash_map> wait_for; for (const auto& pair : dependency_edges) { int32_t src_instance; TF_RETURN_IF_ERROR( @@ -168,7 +169,8 @@ absl::Status InsertControlDependencies( } } for (const auto& pair : wait_for) { - std::vector wait_for_list(pair.second.begin(), pair.second.end()); + std::vector wait_for_list(pair.second.begin(), + pair.second.end()); pair.first->ClearAttr("wait_for"); pair.first->AddAttr("wait_for", wait_for_list); } @@ -184,9 +186,9 @@ absl::Status InsertControlDependencies( absl::Status OrderCollectives(Graph* graph, GraphCollectiveOrder order_type) { // `instance_keys[i]` corresponds to `collective_nodes[i]` std::vector collective_nodes; - std::vector instance_keys; + std::vector instance_keys; // node -> set of collectives on which node depends. - absl::flat_hash_map> data_dependencies; + absl::flat_hash_map> data_dependencies; TF_RETURN_IF_ERROR(DiscoverDataDependencies( graph, &collective_nodes, &instance_keys, &data_dependencies)); diff --git a/tensorflow/core/graph/collective_order_test.cc b/tensorflow/core/graph/collective_order_test.cc index 46333535cbbaad..2206fc1b309d3b 100644 --- a/tensorflow/core/graph/collective_order_test.cc +++ b/tensorflow/core/graph/collective_order_test.cc @@ -32,11 +32,12 @@ REGISTER_OP("TestParams").Output("o: float"); // `expected_collective_nodes`, and that the list of control edges between these // collective nodes matches `expected_collective_control_edges`. 
void VerifyGraph(const Graph& graph, - const std::vector& expected_collective_nodes, - const std::vector>& + const std::vector& expected_collective_nodes, + const std::vector>& expected_collective_control_edges) { - std::vector actual_collective_nodes; - std::vector> actual_collective_control_edges; + std::vector actual_collective_nodes; + std::vector> + actual_collective_control_edges; for (const Node* src : graph.nodes()) { if (!src->IsCollective()) { continue; @@ -63,13 +64,13 @@ void VerifyGraph(const Graph& graph, // `wait_for_map`. void VerifyAttrs( const Graph& graph, - const std::unordered_map> wait_for_map) { + const std::unordered_map> wait_for_map) { for (const Node* node : graph.nodes()) { if (node->IsCollective() || wait_for_map.find(node->name()) == wait_for_map.end()) { continue; } - std::vector wait_for_actual; + std::vector wait_for_actual; TF_EXPECT_OK(GetNodeAttr(node->attrs(), "wait_for", &wait_for_actual)); auto wait_for_expected = wait_for_map.at(node->name()); EXPECT_THAT(wait_for_actual, UnorderedElementsAreArray(wait_for_expected)); @@ -77,7 +78,7 @@ void VerifyAttrs( } Node* CollectiveReduceNode(GraphDefBuilder* builder, Node* input, - const string& name, const string& device, + const std::string& name, const std::string& device, int instance_key) { Node* collective_node = ops::UnaryOp("CollectiveReduce", input, @@ -109,8 +110,8 @@ Node* CollectiveReduceNode(GraphDefBuilder* builder, Node* input, // inputs, `id` is identity node. std::unique_ptr InitGraph() { GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); - const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0"; - const string dev1 = "/job:localhost/replica:0/task:0/device:CPU:1"; + const std::string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0"; + const std::string dev1 = "/job:localhost/replica:0/task:0/device:CPU:1"; Node* a = ops::SourceOp("TestParams", builder.opts().WithName("a").WithDevice(dev0)); Node* b = ops::SourceOp("TestParams", @@ -165,7 +166,7 @@ TEST(CollectiveOrderTest, SimpleOrderAttr) { // `id` is identity node. std::unique_ptr InitGraph2() { GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); - const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0"; + const std::string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0"; Node* a = ops::SourceOp("TestParams", builder.opts().WithName("a").WithDevice(dev0)); Node* c1 = CollectiveReduceNode(&builder, a, "c1", dev0, 1); @@ -201,7 +202,7 @@ TEST(CollectiveOrderTest, SimpleOrder2) { // std::unique_ptr InitGraphForPruning() { GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); - const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0"; + const std::string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0"; Node* w = ops::SourceOp("TestParams", builder.opts().WithName("w").WithDevice(dev0)); Node* x = ops::SourceOp("TestParams", diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc index 4cd9316a4607e3..e443dadc678c26 100644 --- a/tensorflow/core/graph/control_flow.cc +++ b/tensorflow/core/graph/control_flow.cc @@ -27,7 +27,7 @@ namespace tensorflow { namespace { // Information about a loop frame structure. struct Frame { - string name; + std::string name; // Pointer to the parent frame. The root frame has a pointer to itself. Frame* parent = nullptr; @@ -40,7 +40,7 @@ struct Frame { // Verify that the ControlFlowInfo of the graph has valid loop structure. 
absl::Status ValidateControlFlowInfo( const Graph* graph, const std::vector& cf_info) { - std::unordered_map frames; + std::unordered_map frames; for (const Node* node : graph->op_nodes()) { const ControlFlowInfo& cf = cf_info[node->id()]; if (!cf.frame || !cf.parent_frame) { @@ -85,7 +85,7 @@ absl::Status ValidateControlFlowInfo( absl::Status BuildControlFlowInfo(const Graph* g, std::vector* info, - std::vector* unreachable_nodes) { + std::vector* unreachable_nodes) { info->clear(); info->resize(g->num_node_ids()); @@ -97,7 +97,7 @@ absl::Status BuildControlFlowInfo(const Graph* g, src_info.frame = src_node; src_info.parent_frame = src_node; - string frame_name; + std::string frame_name; std::deque ready; ready.push_back(src_node); while (!ready.empty()) { @@ -135,7 +135,8 @@ absl::Status BuildControlFlowInfo(const Graph* g, // Process the node 'out'. if (IsEnter(out)) { if (is_visited) { - const string& parent_frame = (*info)[out_parent->id()].frame_name; + const std::string& parent_frame = + (*info)[out_parent->id()].frame_name; if (parent_frame != frame_name) { return errors::InvalidArgument( FormatNodeForError(*out), diff --git a/tensorflow/core/graph/control_flow.h b/tensorflow/core/graph/control_flow.h index c1e2db339122df..b15bb671f7e1ce 100644 --- a/tensorflow/core/graph/control_flow.h +++ b/tensorflow/core/graph/control_flow.h @@ -36,7 +36,7 @@ struct ControlFlowInfo { const Node* frame = nullptr; // frame of a node const Node* parent_frame = nullptr; // parent frame of a node - string frame_name; // frame name of a node + std::string frame_name; // frame name of a node }; // Clear and populate `info` with each node's frame and the level it belongs to. @@ -54,7 +54,7 @@ struct ControlFlowInfo { // which all sane front-ends should satisfy. absl::Status BuildControlFlowInfo( const Graph* g, std::vector* info, - std::vector* unreachable_nodes = nullptr); + std::vector* unreachable_nodes = nullptr); } // namespace tensorflow diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc index 37d1e69c5b3c66..6026522f28cfb0 100644 --- a/tensorflow/core/graph/costmodel.cc +++ b/tensorflow/core/graph/costmodel.cc @@ -35,7 +35,7 @@ void CostModel::SuppressInfrequent() { // Find the median of the non-zero counts, and use half of its value // as the cutoff for a "normal" execution mode node. if (count_.empty()) return; - std::vector non_zero; + std::vector non_zero; for (auto v : count_) { if (v > 0) non_zero.push_back(v); } @@ -192,7 +192,7 @@ void CostModel::RecordCount(const Node* node, int count) { count_[id] += count; } -int32 CostModel::TotalCount(const Node* node) const { +int32_t CostModel::TotalCount(const Node* node) const { const int id = Id(node); if (id < 0) return 0; return (static_cast(id) < slot_bytes_.size()) ? count_[id] : 0; @@ -419,7 +419,7 @@ Microseconds CostModel::ComputationTimeEstimate(int64_t math_ops) { void CostModel::IncrementUpdateTimes() { update_times_++; } -int32 CostModel::GetUpdateTimes() const { return update_times_; } +int32_t CostModel::GetUpdateTimes() const { return update_times_; } // ---------------------------------------------------------------------------- // InitCostModel diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h index 795d94720415b5..9bfd9b2a60ce1b 100644 --- a/tensorflow/core/graph/costmodel.h +++ b/tensorflow/core/graph/costmodel.h @@ -31,7 +31,7 @@ limitations under the License. 
#include "tensorflow/core/platform/protobuf.h" namespace tensorflow { -typedef std::unordered_map +typedef std::unordered_map NodeNameToCostIdMap; class StepStats; @@ -95,7 +95,7 @@ class CostModel { void RecordCount(const Node* node, int num_count); // Returns how many times "node" has been executed. - int32 TotalCount(const Node* node) const; + int32_t TotalCount(const Node* node) const; // Records that "output_slot" of "node" has produced tensors of // aggregated "bytes". @@ -184,7 +184,7 @@ class CostModel { void IncrementUpdateTimes(); // Get the times that the cost model is updated. - int32 GetUpdateTimes() const; + int32_t GetUpdateTimes() const; private: static Bytes MinTensorMemoryUsage(const TensorShapeProto& tensor_shape, @@ -197,13 +197,13 @@ class CostModel { // Nodes and Edges whose count is < this value // get type/byte estimates of 0. - int32 min_count_ = 0; + int32_t min_count_ = 0; // The number of times the cost model is updated. - int32 update_times_ = 0; + int32_t update_times_ = 0; // Number of times each Node has been executed. - std::vector count_; + std::vector count_; // Cumulative execution time. std::vector time_; // Cumulative Bytes output on each channel. diff --git a/tensorflow/core/graph/costmodel_test.cc b/tensorflow/core/graph/costmodel_test.cc index 0e5c2273f53b20..c062f58856523b 100644 --- a/tensorflow/core/graph/costmodel_test.cc +++ b/tensorflow/core/graph/costmodel_test.cc @@ -56,7 +56,7 @@ MATCHER_P(ShapeProtoEquals, other, "") { return true; } -static void InitGraph(const string& s, Graph* graph) { +static void InitGraph(const std::string& s, Graph* graph) { GraphDef graph_def; auto parser = protobuf::TextFormat::Parser(); @@ -97,8 +97,8 @@ Node* FindNode(const Graph& graph, std::string name) { return nullptr; } -Node* AddNode(Graph& graph, const string& name, const string& node_type, - int num_inputs) { +Node* AddNode(Graph& graph, const std::string& name, + const std::string& node_type, int num_inputs) { auto builder = NodeDefBuilder(name, node_type); for (int i = 0; i < num_inputs; ++i) { builder = builder.Input(absl::StrCat("node_", i), i, DT_FLOAT); @@ -114,7 +114,7 @@ Node* AddNode(Graph& graph, const string& name, const string& node_type, } static void GenerateStepStats(Graph* graph, StepStats* step_stats, - const string& device_name) { + const std::string& device_name) { // Fill RunMetadata's step_stats and partition_graphs fields. 
DeviceStepStats* device_stepstats = step_stats->add_dev_stats(); device_stepstats->set_device(device_name); @@ -150,7 +150,7 @@ TEST(CostModelTest, WorksWithManager) { GenerateStepStats(graph1.get(), &step_stats, "DummyDevice1"); GenerateStepStats(graph2.get(), &step_stats, "DummyDevice2"); StepStatsCollector collector(&step_stats); - std::unordered_map device_map; + std::unordered_map device_map; device_map["DummyDevice1"] = graph1.get(); device_map["DummyDevice2"] = graph2.get(); CostModelManager cost_model_manager; @@ -161,7 +161,7 @@ TEST(CostModelTest, WorksWithManager) { TF_ASSERT_OK( cost_model_manager.AddToCostGraphDef(graph2.get(), &cost_graph_def)); ASSERT_EQ(cost_graph_def.node_size(), 12); - absl::flat_hash_map ids; + absl::flat_hash_map ids; for (auto node : cost_graph_def.node()) { int32_t index = node.id(); auto result = ids.insert({index, node}); diff --git a/tensorflow/core/graph/edgeset.h b/tensorflow/core/graph/edgeset.h index 6d6cb3ff630591..e3f50ef59484ea 100644 --- a/tensorflow/core/graph/edgeset.h +++ b/tensorflow/core/graph/edgeset.h @@ -82,7 +82,7 @@ class EdgeSet { #ifdef NDEBUG void RegisterMutation() {} #else - uint32 mutations_ = 0; + uint32_t mutations_ = 0; void RegisterMutation() { mutations_++; } #endif @@ -127,7 +127,7 @@ class EdgeSet::const_iterator { CHECK_EQ(init_mutations_, owner_->mutations_); } const EdgeSet* owner_ = nullptr; - uint32 init_mutations_ = 0; + uint32_t init_mutations_ = 0; #endif }; diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index a3e14eac396859..c7acee2bd056eb 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -190,7 +190,7 @@ void Node::ClearTypeInfo() { absl::Status Node::ShrinkTypeInfo( const absl::flat_hash_map& index_mapping, - const string& type_attr_name, bool update_full_type) { + const std::string& type_attr_name, bool update_full_type) { std::vector dtypes; TF_RETURN_IF_ERROR(GetNodeAttr(def(), type_attr_name, &dtypes)); @@ -239,11 +239,11 @@ const OpDef& Node::op_def() const { return *props_->op_def; } NodeDef* Node::mutable_def() { return &props_->node_def; } -int32 Node::num_inputs() const { return props_->input_types.size(); } +int32_t Node::num_inputs() const { return props_->input_types.size(); } DataType Node::input_type(int32_t i) const { return props_->input_types[i]; } const DataTypeVector& Node::input_types() const { return props_->input_types; } -int32 Node::num_outputs() const { return props_->output_types.size(); } +int32_t Node::num_outputs() const { return props_->output_types.size(); } DataType Node::output_type(int32_t o) const { return props_->output_types[o]; } const DataTypeVector& Node::output_types() const { return props_->output_types; @@ -416,7 +416,7 @@ bool InputTensor::operator==(const InputTensor& other) const { return node == other.node && index == other.index; } -uint64 InputTensor::Hash::operator()(InputTensor const& s) const { +uint64_t InputTensor::Hash::operator()(InputTensor const& s) const { return Hash64Combine(std::hash()(s.node), std::hash()(s.index)); } @@ -427,7 +427,7 @@ bool OutputTensor::operator==(const OutputTensor& other) const { return node == other.node && index == other.index; } -uint64 OutputTensor::Hash::operator()(OutputTensor const& s) const { +uint64_t OutputTensor::Hash::operator()(OutputTensor const& s) const { return Hash64Combine(std::hash()(s.node), std::hash()(s.index)); } @@ -1086,7 +1086,7 @@ GraphDebugInfo Graph::BuildDebugInfo() const { std::string Edge::DebugString() const { auto src_name = 
src_ ? src_->name().c_str() : ""; auto dst_name = dst_ ? dst_->name().c_str() : ""; - return strings::Printf("[id=%d %s:%d -> %s:%d]", id_, src_name, src_output_, + return absl::StrFormat("[id=%d %s:%d -> %s:%d]", id_, src_name, src_output_, dst_name, dst_input_); } diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 6e70b0cdfa8322..10b29e0975625f 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -107,11 +107,11 @@ class Node { NodeDef* mutable_def(); // input and output types - int32 num_inputs() const; + int32_t num_inputs() const; DataType input_type(int32_t i) const; const DataTypeVector& input_types() const; - int32 num_outputs() const; + int32_t num_outputs() const; DataType output_type(int32_t o) const; const DataTypeVector& output_types() const; @@ -139,14 +139,14 @@ class Node { // Sets 'original_node_names' field of this node's DebugInfo proto to // 'names'. - void set_original_node_names(const std::vector& names); - void set_original_func_names(const std::vector& names); + void set_original_node_names(const std::vector& names); + void set_original_func_names(const std::vector& names); // Read only access to attributes AttrSlice attrs() const; // Inputs requested by the NodeDef. For the actual inputs, use in_edges. - const protobuf::RepeatedPtrField& requested_inputs() const; + const protobuf::RepeatedPtrField& requested_inputs() const; // Get the neighboring nodes via edges either in or out of this node. This // includes control edges. @@ -220,7 +220,7 @@ class Node { UpdateProperties(); } - void AddAttr(const std::string& name, std::vector&& val) { + void AddAttr(const std::string& name, std::vector&& val) { MoveAttrValue(std::move(val), AddAttrHelper(name)); UpdateProperties(); } @@ -278,7 +278,7 @@ class Node { // update the node's full type information (if present). absl::Status ShrinkTypeInfo( const absl::flat_hash_map& index_mapping, - const string& type_attr_name, bool update_full_type); + const std::string& type_attr_name, bool update_full_type); // Called after an incident non-control edge has changed. Does nothing if not // all input edges are defined. @@ -383,8 +383,8 @@ class Node { // Stores debug information associated with the Node. struct NodeDebugInfo { const std::string name; - std::vector original_node_names; - std::vector original_func_names; + std::vector original_node_names; + std::vector original_func_names; NodeDebugInfo(const Node& n); NodeDebugInfo(const NodeDef& ndef); @@ -407,7 +407,7 @@ struct InputTensor { // A hash function for InputTensors. Nodes are hashed based on their pointer // value. struct Hash { - uint64 operator()(InputTensor const& s) const; + uint64_t operator()(InputTensor const& s) const; }; }; @@ -428,7 +428,7 @@ struct OutputTensor { // A hash function for OutputTensors. Nodes are hashed based on their pointer // value. struct Hash { - uint64 operator()(OutputTensor const& s) const; + uint64_t operator()(OutputTensor const& s) const; }; }; @@ -803,7 +803,7 @@ class Graph { WhileContext** result); // Builds a node name to node pointer index for all nodes in the graph. - std::unordered_map BuildNodeNameIndex() const; + std::unordered_map BuildNodeNameIndex() const; absl::optional>& GetConstArgIndicesCache() const { return const_arg_indices_cache_; @@ -906,16 +906,16 @@ class Graph { // A table of the unique assigned device names. Indices do NOT correspond // to node IDs. Index 0 is always the empty string. 
- std::vector device_names_; + std::vector device_names_; // Maps unique device names to indices within device_names_[i]. - std::unordered_map device_names_map_; + std::unordered_map device_names_map_; // All the while contexts owned by this graph, keyed by frame name, // corresponding to all the while loops contained in this graph (including // nested loops). The stored contexts are usually accessed via // AddWhileContext() or Node::while_ctx(), but this manages the lifetime. - std::map while_ctxs_; + std::map while_ctxs_; // Cache of the indices of the arguments which need to be constant for the XLA // compilation. diff --git a/tensorflow/core/graph/graph_debug_info_builder_test.cc b/tensorflow/core/graph/graph_debug_info_builder_test.cc index cbe4a8a8ae9287..5680800a5592c5 100644 --- a/tensorflow/core/graph/graph_debug_info_builder_test.cc +++ b/tensorflow/core/graph/graph_debug_info_builder_test.cc @@ -47,7 +47,7 @@ class TestStackTrace : public AbstractStackTrace { StackFrame LastUserFrame() const override { return frames_.back(); } - string ToString(const TracePrintingOptions& opts) const override { + std::string ToString(const TracePrintingOptions& opts) const override { auto frame = LastUserFrame(); return absl::StrCat(frame.file_name, ":", frame.line_number, ":", frame.function_name); diff --git a/tensorflow/core/graph/graph_def_builder.cc b/tensorflow/core/graph/graph_def_builder.cc index 168fc1a0da3da7..a4f08eab66b090 100644 --- a/tensorflow/core/graph/graph_def_builder.cc +++ b/tensorflow/core/graph/graph_def_builder.cc @@ -44,12 +44,12 @@ GraphDefBuilder::Options GraphDefBuilder::Options::WithControlInputs( } GraphDefBuilder::Options GraphDefBuilder::Options::WithNameImpl( absl::string_view name) { - name_ = string(name); + name_ = std::string(name); return *this; } GraphDefBuilder::Options GraphDefBuilder::Options::WithDeviceImpl( absl::string_view device) { - device_ = string(device); + device_ = std::string(device); return *this; } GraphDefBuilder::Options GraphDefBuilder::Options::WithControlInputImpl( @@ -72,7 +72,7 @@ absl::Status GraphDefBuilder::ToGraphDef(GraphDef* graph_def) const { return status_; } -string GraphDefBuilder::Options::GetNameForOp(absl::string_view op) const { +std::string GraphDefBuilder::Options::GetNameForOp(absl::string_view op) const { if (name_.empty()) return graph_->NewName(op); return name_; } @@ -99,14 +99,15 @@ void GraphDefBuilder::Options::UpdateStatus(const absl::Status& status) const { namespace ops { -Node* SourceOp(const string& op_name, const GraphDefBuilder::Options& opts) { +Node* SourceOp(const std::string& op_name, + const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp(op_name), op_name, opts.op_registry()); return opts.FinalizeBuilder(&node_builder); } -Node* UnaryOp(const string& op_name, NodeOut input, +Node* UnaryOp(const std::string& op_name, NodeOut input, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp(op_name), op_name, @@ -115,7 +116,7 @@ Node* UnaryOp(const string& op_name, NodeOut input, return opts.FinalizeBuilder(&node_builder); } -Node* BinaryOp(const string& op_name, NodeOut a, NodeOut b, +Node* BinaryOp(const std::string& op_name, NodeOut a, NodeOut b, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp(op_name), op_name, @@ -124,7 +125,7 @@ Node* BinaryOp(const string& op_name, NodeOut a, NodeOut b, return 
opts.FinalizeBuilder(&node_builder); } -Node* TernaryOp(const string& op_name, NodeOut a, NodeOut b, NodeOut c, +Node* TernaryOp(const std::string& op_name, NodeOut a, NodeOut b, NodeOut c, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp(op_name), op_name, diff --git a/tensorflow/core/graph/graph_def_builder.h b/tensorflow/core/graph/graph_def_builder.h index b635ece0eab707..afe3aebe55d62c 100644 --- a/tensorflow/core/graph/graph_def_builder.h +++ b/tensorflow/core/graph/graph_def_builder.h @@ -104,14 +104,14 @@ class GraphDefBuilder { // Returns a string representation of the status associated with *this. // Returns the string `"OK"` if the status doesn't have any error. - string StatusToString() const { + std::string StatusToString() const { return status_->ok() ? "OK" : std::string(status_->message()); } // Given the Op type name, return a name for a node of that type. // Uses the value set in WithName() if that has been called. Otherwise, // returns a name built out of the Op type name. - string GetNameForOp(absl::string_view op) const; + std::string GetNameForOp(absl::string_view op) const; // Sets the device, adds control inputs, adds attrs, and calls Finalize(). // If Finalize returns an error, it is saved and this function returns @@ -133,17 +133,17 @@ class GraphDefBuilder { Options WithControlInputsImpl(absl::Span control_inputs); template Options WithAttrImpl(absl::string_view name, T&& value) { - attrs_.emplace_back(string(name), AttrValue()); + attrs_.emplace_back(std::string(name), AttrValue()); SetAttrValue(std::forward(value), &attrs_.back().second); return *this; } Graph* const graph_; absl::Status* const status_; - string name_; - string device_; + std::string name_; + std::string device_; std::vector control_inputs_; - std::vector> attrs_; + std::vector> attrs_; }; // Start building a new graph. @@ -176,7 +176,7 @@ class GraphDefBuilder { // Returns whether a user-defined function with `name` already exists in the // graph. - bool HasFunction(const string& name) { + bool HasFunction(const std::string& name) { return flib_def_.Find(name) != nullptr; } @@ -196,18 +196,19 @@ namespace ops { typedef NodeBuilder::NodeOut NodeOut; // For adding an Op with no inputs to a GraphDefBuilder. -Node* SourceOp(const string& op_name, const GraphDefBuilder::Options& opts); +Node* SourceOp(const std::string& op_name, + const GraphDefBuilder::Options& opts); // For adding an Op with one input to a GraphDefBuilder. -Node* UnaryOp(const string& op_name, NodeOut input, +Node* UnaryOp(const std::string& op_name, NodeOut input, const GraphDefBuilder::Options& opts); // For adding an Op with two inputs to a GraphDefBuilder. -Node* BinaryOp(const string& op_name, NodeOut a, NodeOut b, +Node* BinaryOp(const std::string& op_name, NodeOut a, NodeOut b, const GraphDefBuilder::Options& opts); // For adding an Op with three inputs to a GraphDefBuilder. -Node* TernaryOp(const string& op_name, NodeOut a, NodeOut b, NodeOut c, +Node* TernaryOp(const std::string& op_name, NodeOut a, NodeOut b, NodeOut c, const GraphDefBuilder::Options& opts); } // namespace ops diff --git a/tensorflow/core/graph/graph_node_util.cc b/tensorflow/core/graph/graph_node_util.cc index 3bf14ed2944394..ed6a23e3813d80 100644 --- a/tensorflow/core/graph/graph_node_util.cc +++ b/tensorflow/core/graph/graph_node_util.cc @@ -25,9 +25,11 @@ limitations under the License. 
namespace tensorflow { -string SummarizeNode(const Node& node) { return SummarizeNodeDef(node.def()); } +std::string SummarizeNode(const Node& node) { + return SummarizeNodeDef(node.def()); +} -string FormatNodeForError(const Node& node) { +std::string FormatNodeForError(const Node& node) { return FormatNodeDefForError(node.def()); } @@ -41,9 +43,10 @@ absl::Status AttachDef(const absl::Status& status, const Node& node, return AttachDef(status, node.def(), allow_multiple_formatted_node); } -absl::btree_set GetMergedNames(const std::vector& from_names, - const std::vector& to_names) { - absl::btree_set merged_names; +absl::btree_set GetMergedNames( + const std::vector& from_names, + const std::vector& to_names) { + absl::btree_set merged_names; merged_names.insert(from_names.begin(), from_names.end()); merged_names.insert(to_names.begin(), to_names.end()); return merged_names; diff --git a/tensorflow/core/graph/graph_node_util.h b/tensorflow/core/graph/graph_node_util.h index 146c4c07ca833a..8d7a44c5fed2e0 100644 --- a/tensorflow/core/graph/graph_node_util.h +++ b/tensorflow/core/graph/graph_node_util.h @@ -29,12 +29,12 @@ class OpDef; // Produce a human-readable version of a Node or NodeDef that is more concise // than a text-format proto. -string SummarizeNode(const Node& node); +std::string SummarizeNode(const Node& node); // Produces a formatted string pattern from the node which can uniquely identify // this node upstream to produce an informative error message. The pattern // followed is: {{node }} -string FormatNodeForError(const Node& node); +std::string FormatNodeForError(const Node& node); // Merges the original node names from the debug information of 'from' to the // debug information of 'to'. diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc index be5a5423ae57c6..1328c5c8b57b4c 100644 --- a/tensorflow/core/graph/graph_partition.cc +++ b/tensorflow/core/graph/graph_partition.cc @@ -160,16 +160,17 @@ bool IsDstInputOnHost(const Edge* edge, const GraphInfo& info) { // Add a control edge from each input to each recv. void AddReadControl(const std::vector& recvs, - const std::vector& inputs) { + const std::vector& inputs) { for (NodeDef* recv : recvs) { - for (const string& input : inputs) { + for (const std::string& input : inputs) { recv->add_input(absl::StrCat("^", input)); } } } void SetSendRecvAttrs(const PartitionOptions& opts, const Edge* edge, - const string& tensor_name_attr, NodeDefBuilder* builder) { + const std::string& tensor_name_attr, + NodeDefBuilder* builder) { builder->Attr("tensor_name", tensor_name_attr); builder->Attr("send_device", edge->src()->assigned_device_name()); builder->Attr("send_device_incarnation", @@ -184,7 +185,7 @@ void SetSendRecvAttrs(const PartitionOptions& opts, const Edge* edge, NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info, GraphDef* gdef, const Edge* edge, NodeDefBuilder::NodeOut send_from, int64_t start_time, - const string& tensor_name_attr, absl::Status* status) { + const std::string& tensor_name_attr, absl::Status* status) { const DataType dtype = send_from.data_type; const DataType cast_dtype = opts.should_cast ? opts.should_cast(edge) : dtype; const Node* src = edge->src(); @@ -201,7 +202,7 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info, // Add a cast node that casts dtype to cast_dtype. // NOTE(yuanbyu): Only cast for cross-device send/recv. 
if (dtype != cast_dtype && !NeedSameDeviceSendRecv(edge, g_info)) { - const string cast_op = (host_memory) ? "_HostCast" : "Cast"; + const std::string cast_op = (host_memory) ? "_HostCast" : "Cast"; NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op, NodeDebugInfo(*src)); cast_builder.Device(src->assigned_device_name()).Input(send_from); @@ -226,7 +227,7 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info, } // Add the send node. - const string send_op = (host_memory) ? "_HostSend" : "_Send"; + const std::string send_op = (host_memory) ? "_HostSend" : "_Send"; NodeDefBuilder send_builder(opts.new_name(src->name()), send_op, NodeDebugInfo(*src)); SetSendRecvAttrs(opts, edge, tensor_name_attr, &send_builder); @@ -241,7 +242,7 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info, NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info, GraphDef* gdef, const Edge* edge, NodeDef** real_recv, - const string& tensor_name_attr, absl::Status* status) { + const std::string& tensor_name_attr, absl::Status* status) { const DataType dtype = EdgeType(edge); const Node* src = edge->src(); const Node* dst = edge->dst(); @@ -285,7 +286,7 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info, } // Add the recv node. - const string recv_op = (host_memory) ? "_HostRecv" : "_Recv"; + const std::string recv_op = (host_memory) ? "_HostRecv" : "_Recv"; NodeDefBuilder recv_builder(opts.new_name(src->name()), recv_op, NodeDebugInfo(*src)); SetSendRecvAttrs(opts, edge, tensor_name_attr, &recv_builder); @@ -298,7 +299,7 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info, // Add the cast node (from cast_dtype to dtype) or an Identity node. if (dtype != cast_dtype) { - const string cast_op = (host_memory) ? "_HostCast" : "Cast"; + const std::string cast_op = (host_memory) ? "_HostCast" : "Cast"; NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op, NodeDebugInfo(*src)); cast_builder.Attr("DstT", dtype); @@ -339,8 +340,9 @@ NodeDef* AddDummyConst(const PartitionOptions& opts, GraphDef* gdef, // A dummy node for scheduling. NodeDef* AddControlTrigger(const PartitionOptions& opts, GraphDef* gdef, - const string& assigned_device_name, int64_t epoch, - int64_t starttime, absl::Status* status) { + const std::string& assigned_device_name, + int64_t epoch, int64_t starttime, + absl::Status* status) { NodeDef* result = gdef->add_node(); *status = NodeDefBuilder(opts.new_name(absl::StrCat("synch_", epoch)), "ControlTrigger") @@ -398,18 +400,19 @@ void OptimizeControlFlowColocation(Graph* graph) { DFS(*graph, visit, {}); } -string ControlLoopName(const string& name) { +std::string ControlLoopName(const std::string& name) { return absl::StrCat("_cloop", name); } bool IsControlLoop(const Node* node) { - const string& name = node->name(); + const std::string& name = node->name(); return absl::StartsWith(name, "_cloop"); } // An enter node for control flow. -Node* AddControlEnter(Graph* g, const string& node_name, - const string& device_name, const string& frame_name, +Node* AddControlEnter(Graph* g, const std::string& node_name, + const std::string& device_name, + const std::string& frame_name, const int parallel_iterations, absl::Status* status) { NodeBuilder node_builder(node_name, "Enter", g->op_registry()); node_builder.Input({"dummy", 0, DT_FLOAT}); @@ -423,9 +426,9 @@ Node* AddControlEnter(Graph* g, const string& node_name, } // A merge node for control flow. 
-Node* AddControlMerge(const string& in_name1, const string& in_name2, Graph* g, - const string& node_name, const string& device_name, - absl::Status* status) { +Node* AddControlMerge(const std::string& in_name1, const std::string& in_name2, + Graph* g, const std::string& node_name, + const std::string& device_name, absl::Status* status) { NodeBuilder node_builder(node_name, "Merge", g->op_registry()); node_builder.Input({{in_name1, 0, DT_FLOAT}, {in_name2, 0, DT_FLOAT}}); Node* res_node; @@ -437,7 +440,7 @@ Node* AddControlMerge(const string& in_name1, const string& in_name2, Graph* g, // A switch node for control flow. Node* AddControlSwitch(NodeBuilder::NodeOut input1, NodeBuilder::NodeOut input2, - const string& device_name, + const std::string& device_name, const GraphDefBuilder::Options& bopts) { Node* res_node = ops::BinaryOp("Switch", std::move(input1), std::move(input2), bopts); @@ -447,7 +450,7 @@ Node* AddControlSwitch(NodeBuilder::NodeOut input1, NodeBuilder::NodeOut input2, } // A next_iteration node for control flow. -Node* AddControlNext(NodeBuilder::NodeOut input, const string& device_name, +Node* AddControlNext(NodeBuilder::NodeOut input, const std::string& device_name, const GraphDefBuilder::Options& bopts) { Node* res_node = ops::UnaryOp("NextIteration", std::move(input), bopts); if (bopts.HaveError()) return nullptr; @@ -469,7 +472,7 @@ Node* EmptyConst(const GraphDefBuilder::Options& options) { } // A dummy const node for control flow. -Node* AddControlConst(const string& device_name, +Node* AddControlConst(const std::string& device_name, const GraphDefBuilder::Options& bopts) { Node* res_node = EmptyConst(bopts); if (bopts.HaveError()) return nullptr; @@ -513,21 +516,22 @@ absl::Status AddControlLoop(const PartitionOptions& opts, Graph* g, absl::Status status; GraphDefBuilder::Options bopts(g, &status); const ControlFlowInfo& src_info = (*cf_info)[src->id()]; - const string& device_name = edge->dst()->assigned_device_name(); - const string& frame_name = src_info.frame_name; + const std::string& device_name = edge->dst()->assigned_device_name(); + const std::string& frame_name = src_info.frame_name; int parallel_iterations; status = GetNodeAttr(src_info.frame->attrs(), "parallel_iterations", ¶llel_iterations); if (!status.ok()) return status; // The names of the nodes to be added. - const string& enter_name = + const std::string& enter_name = ControlLoopName(opts.new_name(edge->dst()->name())); - const string& merge_name = + const std::string& merge_name = ControlLoopName(opts.new_name(edge->dst()->name())); - const string& switch_name = + const std::string& switch_name = + ControlLoopName(opts.new_name(edge->dst()->name())); + const std::string& next_name = ControlLoopName(opts.new_name(edge->dst()->name())); - const string& next_name = ControlLoopName(opts.new_name(edge->dst()->name())); // Add the nodes to the graph g. Node* enter = AddControlEnter(g, enter_name, device_name, frame_name, @@ -634,14 +638,14 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g, OptimizeControlFlowColocation(g); // The map from frames to their LoopCond nodes. 
- std::unordered_map frame_cond_map; + std::unordered_map frame_cond_map; int num_node_ids = g->num_node_ids(); for (int i = 0; i < num_node_ids; ++i) { Node* node = g->FindNodeId(i); if (node == nullptr) continue; if (IsLoopCond(node)) { - const string& frame_name = cf_info[node->id()].frame_name; + const std::string& frame_name = cf_info[node->id()].frame_name; DCHECK(!frame_name.empty()); frame_cond_map[frame_name] = node; } @@ -655,7 +659,7 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g, // the merge of the outer loop to the enter of the inner loop. // // A map from to ControlLoop. - std::unordered_map control_loops; + std::unordered_map control_loops; int num_edge_ids = g->num_edge_ids(); for (int i = 0; i < num_edge_ids; ++i) { const Edge* edge = g->FindEdgeId(i); @@ -666,15 +670,15 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g, // Skip Sink/Source nodes. if (!src->IsOp() || !dst->IsOp()) continue; - const string& src_device = src->assigned_device_name(); - const string& dst_device = dst->assigned_device_name(); + const std::string& src_device = src->assigned_device_name(); + const std::string& dst_device = dst->assigned_device_name(); // Skip local edges. if (src_device == dst_device) continue; const Node* src_frame = OutputFrame(src, cf_info); const Node* dst_frame = InputFrame(dst, cf_info); - const string& src_frame_name = cf_info[src_frame->id()].frame_name; - const string& dst_frame_name = cf_info[dst_frame->id()].frame_name; + const std::string& src_frame_name = cf_info[src_frame->id()].frame_name; + const std::string& dst_frame_name = cf_info[dst_frame->id()].frame_name; // Skip if src and dst are not in the same frame. if (src_frame_name.empty() || src_frame_name != dst_frame_name) { continue; @@ -685,12 +689,12 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g, // for its outer frame when nested. ControlLoop child_loop; while (true) { - const string& curr_frame_name = cf_info[src_frame->id()].frame_name; + const std::string& curr_frame_name = cf_info[src_frame->id()].frame_name; if (curr_frame_name.empty()) { // We have reached the root frame. if (child_loop.merge != nullptr) { - const string& node_name = opts.new_name(edge->dst()->name()); - const string& device_name = edge->dst()->assigned_device_name(); + const std::string& node_name = opts.new_name(edge->dst()->name()); + const std::string& device_name = edge->dst()->assigned_device_name(); Node* const_node = AddControlConst(device_name, bopts.WithName(node_name)); if (!status.ok()) return status; @@ -700,7 +704,8 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g, break; } - const string& cl_key = absl::StrCat(curr_frame_name, "$$", dst_device); + const std::string& cl_key = + absl::StrCat(curr_frame_name, "$$", dst_device); auto it = control_loops.find(cl_key); if (it != control_loops.end()) { if (child_loop.enter != nullptr) { @@ -748,15 +753,16 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g, // Skip Sink/Source nodes. 
if (!src->IsOp() || !dst->IsOp()) continue; - const string& src_device = src->assigned_device_name(); - const string& dst_device = dst->assigned_device_name(); + const std::string& src_device = src->assigned_device_name(); + const std::string& dst_device = dst->assigned_device_name(); if (src_device != dst_device) { const Node* src_frame = OutputFrame(src, cf_info); const Node* dst_frame = InputFrame(dst, cf_info); - const string& src_frame_name = cf_info[src_frame->id()].frame_name; - const string& dst_frame_name = cf_info[dst_frame->id()].frame_name; + const std::string& src_frame_name = cf_info[src_frame->id()].frame_name; + const std::string& dst_frame_name = cf_info[dst_frame->id()].frame_name; if (!src_frame_name.empty() && src_frame_name == dst_frame_name) { - const string& cl_key = absl::StrCat(dst_frame_name, "$$", dst_device); + const std::string& cl_key = + absl::StrCat(dst_frame_name, "$$", dst_device); ControlLoop loop = control_loops[cl_key]; DCHECK(loop.enter != nullptr); // Note that we'll create multiple duplicate edges if dst has multiple @@ -812,12 +818,13 @@ absl::Status TopologicalSortNodesWithTimePriority( }; // Build initial structures, initial contents of queue. - std::unordered_map> node_to_output_nodes; + std::unordered_map> + node_to_output_nodes; std::unordered_map inputs_needed; for (int n = 0; n < gdef->node_size(); ++n) { const NodeDef* ndef = &gdef->node(n); for (int i = 0; i < ndef->input_size(); ++i) { - node_to_output_nodes[string(ParseTensorName(ndef->input(i)).first)] + node_to_output_nodes[std::string(ParseTensorName(ndef->input(i)).first)] .push_back(ndef); } int64_t start_time; @@ -872,8 +879,9 @@ absl::Status TopologicalSortNodesWithTimePriority( return absl::OkStatus(); } -absl::Status AddControlEdges(const PartitionOptions& opts, - std::unordered_map* partitions) { +absl::Status AddControlEdges( + const PartitionOptions& opts, + std::unordered_map* partitions) { absl::Status status; // TODO(yuanbyu): Very naive for now. To be improved. const int num_epochs = 100; @@ -891,7 +899,7 @@ absl::Status AddControlEdges(const PartitionOptions& opts, // Add a dummy node for every epoch, and add a control edge from the // "last" node in the preceding epoch to the dummy node. - string device_name = gdef->node(0).device(); + std::string device_name = gdef->node(0).device(); int64_t makespan = start_times.back().second; int64_t resolution = (makespan / num_epochs) + 1; @@ -909,7 +917,7 @@ absl::Status AddControlEdges(const PartitionOptions& opts, } dummys.push_back(dummy); if (j > 0) { - string src_name = start_times[j - 1].first->name(); + std::string src_name = start_times[j - 1].first->name(); Graph::AddInput(dummy, src_name, Graph::kControlSlot); } i++; @@ -940,7 +948,7 @@ void SetIncarnation(const PartitionOptions& opts, NodeDef* ndef) { // Not related to send/recv. return; } - const string& send_device = GetNodeAttrString(*ndef, "send_device"); + const std::string& send_device = GetNodeAttrString(*ndef, "send_device"); if (send_device.empty()) { // No known send_device. The runtime will detect it later. 
return; @@ -968,10 +976,10 @@ void SetIncarnation(const PartitionOptions& opts, GraphDef* gdef) { } absl::Status Partition(const PartitionOptions& opts, Graph* g, - std::unordered_map* partitions) { + std::unordered_map* partitions) { // TODO(b/290689453) Refactor this into smaller functions absl::Status status; - absl::flat_hash_map> + absl::flat_hash_map> debug_info_builders; partitions->clear(); @@ -991,7 +999,7 @@ absl::Status Partition(const PartitionOptions& opts, Graph* g, status = BuildMemoryDeviceInfo(*g, &g_info); if (!status.ok()) return status; - string dstp; + std::string dstp; std::vector inputs; DupRecvTable dup_recv(3); // For a node dst, 'ref_recvs' remembers the recvs introduced by a ref @@ -999,7 +1007,7 @@ absl::Status Partition(const PartitionOptions& opts, Graph* g, // edge to dst. We will add a control edge for every pair in // (ref_recvs x ref_control_inputs). std::vector ref_recvs; - std::vector ref_control_inputs; + std::vector ref_control_inputs; int32_t num_data = 0; int32_t num_control = 0; @@ -1121,7 +1129,7 @@ absl::Status Partition(const PartitionOptions& opts, Graph* g, auto iter = dup_recv.find(key); if (iter != dup_recv.end()) { // We found one. Reuse the data/control transferred already. - const string& recv_node_name = iter->second.recv->name(); + const std::string& recv_node_name = iter->second.recv->name(); if (edge->IsControlEdge()) { Graph::AddInput(dst_def, recv_node_name, Graph::kControlSlot); } else { @@ -1157,7 +1165,7 @@ absl::Status Partition(const PartitionOptions& opts, Graph* g, send_from.Reset(src->name(), edge->src_output(), EdgeType(edge)); } - string tensor_name_attr; + std::string tensor_name_attr; if (opts.get_tensor_name_attr) { tensor_name_attr = opts.get_tensor_name_attr(edge); } else { diff --git a/tensorflow/core/graph/graph_partition.h b/tensorflow/core/graph/graph_partition.h index 59e9fe0e61c35d..c1d9493c76c6b5 100644 --- a/tensorflow/core/graph/graph_partition.h +++ b/tensorflow/core/graph/graph_partition.h @@ -31,19 +31,19 @@ namespace tensorflow { struct PartitionOptions { // A function that returns a location for the execution of a given // Node. - typedef std::function NodeToLocFunc; + typedef std::function NodeToLocFunc; NodeToLocFunc node_to_loc = nullptr; // A function that returns a unique graph node name with the given // prefix. - typedef std::function NewNameFunc; + typedef std::function NewNameFunc; NewNameFunc new_name = nullptr; // A function that returns the incarnation of a device given the // device's fullname. If not found, GetIncarnationFunc should return // kIllegalIncarnation. - static constexpr uint64 kIllegalIncarnation = 0; - typedef std::function GetIncarnationFunc; + static constexpr uint64_t kIllegalIncarnation = 0; + typedef std::function GetIncarnationFunc; GetIncarnationFunc get_incarnation = nullptr; // If specified, flib_def defines a function library that should be @@ -79,7 +79,7 @@ struct PartitionOptions { // Optional customized function to compute the "tensor_name" attr value of // Send/Recv ops inserted during partitioning. - std::function get_tensor_name_attr = nullptr; + std::function get_tensor_name_attr = nullptr; // If true, the `Partition()` function can make destructive changes to the // passed-in `Graph`. @@ -96,13 +96,14 @@ struct PartitionOptions { // // Stores the partitions in *partitions. 
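The graph_partition.h hunk above migrates PartitionOptions' callback typedefs (NodeToLocFunc, NewNameFunc, GetIncarnationFunc) to std::string and uint64_t. As a rough, standalone illustration of how such std::function-based options are filled in with lambdas, here is a simplified stand-in; the Options struct and its signatures are invented for this sketch and take strings where TensorFlow's real callbacks take const Node* and device names.

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

// Simplified stand-in for a callback-style options struct.
struct Options {
  std::function<std::string(const std::string&)> node_to_loc;
  std::function<std::string(const std::string&)> new_name;
  std::function<uint64_t(const std::string&)> get_incarnation;
};

int main() {
  int counter = 0;
  Options opts;
  // Route nodes whose name starts with 'G' to the GPU, everything else to CPU 0.
  opts.node_to_loc = [](const std::string& node_name) {
    return node_name[0] == 'G' ? std::string("/device:GPU:0")
                               : std::string("/device:CPU:0");
  };
  // Produce a unique name by appending a running counter to the prefix.
  opts.new_name = [&counter](const std::string& prefix) {
    return prefix + "/_" + std::to_string(counter++);
  };
  // Derive a fake incarnation number from the device name's last character.
  opts.get_incarnation = [](const std::string& device) {
    return static_cast<uint64_t>(device.back() - '0') + 100;
  };

  std::cout << opts.node_to_loc("G1") << " " << opts.new_name("A1") << " "
            << opts.get_incarnation("/device:CPU:1") << "\n";
  return 0;
}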
absl::Status Partition(const PartitionOptions& opts, Graph* input, - std::unordered_map* partitions); + std::unordered_map* partitions); // Add control edges to the partitions to control the ordering // and timing of the recv nodes based on the start times calculated // using some scheduling algorithm. -absl::Status AddControlEdges(const PartitionOptions& opts, - std::unordered_map* partitions); +absl::Status AddControlEdges( + const PartitionOptions& opts, + std::unordered_map* partitions); } // namespace tensorflow diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc index 5f3d0a1b4117f2..4f5e431b87df50 100644 --- a/tensorflow/core/graph/graph_partition_test.cc +++ b/tensorflow/core/graph/graph_partition_test.cc @@ -68,21 +68,23 @@ using ::testing::Ne; const char gpu_device[] = "/job:a/replica:0/task:0/device:GPU:0"; -string SplitByDevice(const Node* node) { return node->assigned_device_name(); } +std::string SplitByDevice(const Node* node) { + return node->assigned_device_name(); +} -string DeviceName(const Node* node) { +std::string DeviceName(const Node* node) { char first = node->name()[0]; if (first == 'G') { return gpu_device; } else { - const string cpu_prefix = "/job:a/replica:0/task:0/cpu:"; + const std::string cpu_prefix = "/job:a/replica:0/task:0/cpu:"; int index = first - 'A'; return absl::StrCat(cpu_prefix, index); } } void Partition(const GraphDef& graph_def, - std::unordered_map* partitions) { + std::unordered_map* partitions) { Graph g(OpRegistry::Global()); GraphConstructorOptions opts; TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, &g)); @@ -90,16 +92,18 @@ void Partition(const GraphDef& graph_def, // Assigns devices to each node. Uses 1st letter of the node name as the // device index if no device is specified. for (Node* node : g.nodes()) { - string device_name = !node->requested_device().empty() - ? node->requested_device() - : DeviceName(node); + std::string device_name = !node->requested_device().empty() + ? 
node->requested_device() + : DeviceName(node); node->set_assigned_device_name(device_name); } PartitionOptions popts; popts.node_to_loc = SplitByDevice; - popts.new_name = [&g](const string& prefix) { return g.NewName(prefix); }; - popts.get_incarnation = [](const string& name) { + popts.new_name = [&g](const std::string& prefix) { + return g.NewName(prefix); + }; + popts.get_incarnation = [](const std::string& name) { return (name[0] - 'A') + 100; }; absl::Status s = Partition(popts, &g, partitions); @@ -116,7 +120,7 @@ void Partition(const GraphDef& graph_def, } void CheckLoopConstruction(const GraphDef& graph_def) { - std::unordered_map partitions; + std::unordered_map partitions; Partition(graph_def, &partitions); for (const auto& kv : partitions) { const GraphDef& gdef = kv.second; @@ -128,7 +132,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) { // _recvs must have a control input if (ndef.op() == "_Recv") { bool has_control = false; - for (const string& input_name : ndef.input()) { + for (const std::string& input_name : ndef.input()) { if (absl::StartsWith(input_name, "^")) { has_control = true; break; @@ -171,10 +175,10 @@ REGISTER_OP("Combine") .Output("o: float") .SetShapeFn(shape_inference::UnknownShape); -Output ConstructOp(const Scope& scope, const string& op_type, +Output ConstructOp(const Scope& scope, const std::string& op_type, const absl::Span inputs) { if (!scope.ok()) return Output(); - const string unique_name = scope.GetUniqueNameForOp(op_type); + const std::string unique_name = scope.GetUniqueNameForOp(op_type); auto builder = NodeBuilder(unique_name, op_type, scope.graph()->op_registry()); for (auto const& input : inputs) { @@ -230,20 +234,20 @@ class GraphPartitionTest : public ::testing::Test { void ExpectMatchA() { GraphDef graph_def; TF_EXPECT_OK(scope_a_.ToGraphDef(&graph_def)); - string a = "/job:a/replica:0/task:0/cpu:0"; + std::string a = "/job:a/replica:0/task:0/cpu:0"; TF_EXPECT_GRAPH_EQ(graph_def, partitions_[a]); } void ExpectMatchB() { GraphDef graph_def; TF_EXPECT_OK(scope_b_.ToGraphDef(&graph_def)); - string b = "/job:a/replica:0/task:0/cpu:1"; + std::string b = "/job:a/replica:0/task:0/cpu:1"; TF_EXPECT_GRAPH_EQ(graph_def, partitions_[b]); } void ExpectFunctions(const FunctionDefLibrary& library, - const std::set& expected_names) { - std::set actual_names; + const std::set& expected_names) { + std::set actual_names; for (const FunctionDef& fdef : library.function()) { actual_names.insert(fdef.signature().name()); } @@ -254,7 +258,7 @@ class GraphPartitionTest : public ::testing::Test { GraphDef in_graph_def_; Scope scope_a_; Scope scope_b_; - std::unordered_map partitions_; + std::unordered_map partitions_; }; TEST_F(GraphPartitionTest, SingleDevice) { @@ -277,8 +281,8 @@ TEST_F(GraphPartitionTest, CrossDeviceData) { Partition(ToGraphDef(), &partitions_); EXPECT_EQ(2, partitions_.size()); - string a = "/job:a/replica:0/task:0/cpu:0"; - string b = "/job:a/replica:0/task:0/cpu:1"; + std::string a = "/job:a/replica:0/task:0/cpu:0"; + std::string b = "/job:a/replica:0/task:0/cpu:1"; a1 = FloatInput(scope_a_.WithOpName("A1")); _Send(scope_a_.WithOpName("A1/_0"), a1, "edge_1_A1", a, 82, b); ExpectMatchA(); @@ -298,8 +302,8 @@ TEST_F(GraphPartitionTest, CrossDeviceControl) { Partition(ToGraphDef(), &partitions_); EXPECT_EQ(2, partitions_.size()); - string a = "/job:a/replica:0/task:0/cpu:0"; - string b = "/job:a/replica:0/task:0/cpu:1"; + std::string a = "/job:a/replica:0/task:0/cpu:0"; + std::string b = "/job:a/replica:0/task:0/cpu:1"; a1 = 
FloatInput(scope_a_.WithOpName("A1")); auto c = Const(scope_a_.WithOpName("A1/ctrl/_0").WithControlDependencies(a1), {}); @@ -323,8 +327,8 @@ TEST_F(GraphPartitionTest, CrossDeviceData_MultiUse) { Partition(ToGraphDef(), &partitions_); EXPECT_EQ(2, partitions_.size()); - string a = "/job:a/replica:0/task:0/cpu:0"; - string b = "/job:a/replica:0/task:0/cpu:1"; + std::string a = "/job:a/replica:0/task:0/cpu:0"; + std::string b = "/job:a/replica:0/task:0/cpu:1"; a1 = FloatInput(scope_a_.WithOpName("A1")); _Send(scope_a_.WithOpName("A1/_0"), a1, "edge_1_A1", a, 82, b); ExpectMatchA(); @@ -346,8 +350,8 @@ TEST_F(GraphPartitionTest, CrossDeviceControl_MultiUse) { Partition(ToGraphDef(), &partitions_); EXPECT_EQ(2, partitions_.size()); - string a = "/job:a/replica:0/task:0/cpu:0"; - string b = "/job:a/replica:0/task:0/cpu:1"; + std::string a = "/job:a/replica:0/task:0/cpu:0"; + std::string b = "/job:a/replica:0/task:0/cpu:1"; a1 = FloatInput(scope_a_.WithOpName("A1")); auto c = Const(scope_a_.WithOpName("A1/ctrl/_0").WithControlDependencies(a1), {}); @@ -372,8 +376,8 @@ TEST_F(GraphPartitionTest, CrossDevice_DataControl) { Partition(ToGraphDef(), &partitions_); EXPECT_EQ(2, partitions_.size()); - string a = "/job:a/replica:0/task:0/cpu:0"; - string b = "/job:a/replica:0/task:0/cpu:1"; + std::string a = "/job:a/replica:0/task:0/cpu:0"; + std::string b = "/job:a/replica:0/task:0/cpu:1"; a1 = FloatInput(scope_a_.WithOpName("A1")); _Send(scope_a_.WithOpName("A1/_0"), a1, "edge_1_A1", a, 82, b); auto c = @@ -417,7 +421,7 @@ TEST_F(GraphPartitionTest, CrossDeviceLoopSimple1) { auto b1 = Identity(in_.WithOpName("B1"), a3); NextIteration(in_.WithOpName("B5"), b1); - std::unordered_map partitions; + std::unordered_map partitions; Partition(ToGraphDef(), &partitions); for (const auto& kv : partitions) { const GraphDef& gdef = kv.second; @@ -471,10 +475,12 @@ TEST_F(GraphPartitionTest, PartitionIncompleteGraph) { PartitionOptions popts; popts.node_to_loc = SplitByDevice; - popts.new_name = [&g](const string& prefix) { return g.NewName(prefix); }; - popts.get_incarnation = [](const string&) { return 1; }; + popts.new_name = [&g](const std::string& prefix) { + return g.NewName(prefix); + }; + popts.get_incarnation = [](const std::string&) { return 1; }; - std::unordered_map partitions; + std::unordered_map partitions; status = Partition(popts, &g, &partitions); // Partitioning should fail, but not crash like it did before the // changes that accompanied the addition of this test. @@ -498,8 +504,8 @@ TEST_F(GraphPartitionTest, Functions) { EXPECT_EQ(2, partitions_.size()); // Test that partition graphs inherit function library from original graph. - string a = "/job:a/replica:0/task:0/cpu:0"; - string b = "/job:a/replica:0/task:0/cpu:1"; + std::string a = "/job:a/replica:0/task:0/cpu:0"; + std::string b = "/job:a/replica:0/task:0/cpu:1"; // Node "A2" is placed in part `a`, and uses only "XTimesTwo". ExpectFunctions(partitions_[a].library(), {"XTimesTwo"}); @@ -602,7 +608,7 @@ TEST_F(GraphPartitionTest, GraphDebugInfo) { // Expect each partitioned graph to contain the stack traces for its nodes. // A stack trace for A1 should be in the A partition (".../cpu:0"). 
- string a = "/job:a/replica:0/task:0/cpu:0"; + std::string a = "/job:a/replica:0/task:0/cpu:0"; const GraphDebugInfo& a_debug_info = partitions_[a].debug_info(); StackTracesMap traces = LoadTracesFromDebugInfo(a_debug_info); const auto& a_it = traces.find("A1"); @@ -611,7 +617,7 @@ TEST_F(GraphPartitionTest, GraphDebugInfo) { ::testing::ContainsRegex("alpha.cc.*30")); // Stack traces for B1 and B2 should be in the B partition (".../cpu:1"). - string b = "/job:a/replica:0/task:0/cpu:1"; + std::string b = "/job:a/replica:0/task:0/cpu:1"; const GraphDebugInfo& b_debug_info = partitions_[b].debug_info(); traces = LoadTracesFromDebugInfo(b_debug_info); const auto& b1_it = traces.find("B1"); diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc index a5b519365034f2..fb5ce07959a424 100644 --- a/tensorflow/core/graph/graph_test.cc +++ b/tensorflow/core/graph/graph_test.cc @@ -100,13 +100,13 @@ class GraphTest : public ::testing::Test { EXPECT_EQ(edges, graph_.num_edges()); } - Node* AddNodeWithName(const string& name) { + Node* AddNodeWithName(const std::string& name) { Node* node; TF_CHECK_OK(NodeBuilder(name, "NoOp").Finalize(&graph_, &node)); return node; } - Node* FromNodeDef(const string& name, const string& node_type, + Node* FromNodeDef(const std::string& name, const std::string& node_type, int num_inputs) { auto builder = NodeDefBuilder(name, node_type); for (int i = 0; i < num_inputs; ++i) { @@ -122,14 +122,14 @@ class GraphTest : public ::testing::Test { return node; } - void FromGraphDef(const string& gdef_ascii) { + void FromGraphDef(const std::string& gdef_ascii) { GraphDef gdef; CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &gdef)); GraphConstructorOptions opts; TF_CHECK_OK(ConvertGraphDefToGraph(opts, gdef, &graph_)); } - Node* FindNode(const string& name) { + Node* FindNode(const std::string& name) { for (Node* node : graph_.nodes()) { if (node->name() == name) return node; } @@ -158,8 +158,8 @@ class GraphTest : public ::testing::Test { private: // Convert a list of nodes to a sorted list of strings so failure messages // are readable. - static std::vector Stringify(const std::vector& nodes) { - std::vector result; + static std::vector Stringify(const std::vector& nodes) { + std::vector result; result.reserve(nodes.size()); for (Node* n : nodes) { result.push_back(n->DebugString()); @@ -322,14 +322,14 @@ TEST_F(GraphTest, NodeIteration) { graph_.RemoveNode(c); // expected = set of all node DebugStrings we expect in the graph - std::set expected; + std::set expected; expected.insert(graph_.source_node()->DebugString()); expected.insert(a->DebugString()); expected.insert(d->DebugString()); expected.insert(graph_.sink_node()->DebugString()); // Verify that iterating through ids gets the same set of nodes. - std::set actual; + std::set actual; for (int id = 0; id < graph_.num_node_ids(); ++id) { Node* node = graph_.FindNodeId(id); if (node != nullptr) { @@ -370,7 +370,7 @@ TEST_F(GraphTest, AddAttr) { n1->AddAttr("_a", "new_attr"); - string attr; + std::string attr; EXPECT_EQ(absl::OkStatus(), GetNodeAttr(n1->attrs(), "_a", &attr)); EXPECT_EQ("new_attr", attr); @@ -389,13 +389,13 @@ TEST_F(GraphTest, AddAttr) { } // Convert edge iteration results into a sorted string. 
-static string EdgeIter(const Graph& g) { +static std::string EdgeIter(const Graph& g) { std::vector > edges; for (const Edge* e : g.edges()) { edges.push_back(std::make_pair(e->src()->id(), e->dst()->id())); } std::sort(edges.begin(), edges.end()); - string result; + std::string result; for (auto& p : edges) { absl::StrAppend(&result, p.first, "->", p.second, ";"); } @@ -422,9 +422,9 @@ TEST_F(GraphTest, EdgeIteration) { } TEST_F(GraphTest, NewName) { - string a1 = graph_.NewName("A"); - string a2 = graph_.NewName("A"); - string b1 = graph_.NewName("B"); + std::string a1 = graph_.NewName("A"); + std::string a2 = graph_.NewName("A"); + std::string b1 = graph_.NewName("B"); EXPECT_NE(a1, a2); EXPECT_NE(a1, b1); EXPECT_NE(a2, b1); @@ -446,19 +446,19 @@ TEST_F(GraphTest, IsValidNode) { // nullptr absl::Status s = graph_.IsValidNode(nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); - EXPECT_EQ(string("Node is null"), s.message()); + EXPECT_EQ(std::string("Node is null"), s.message()); // node id_ is too high s = graph_.IsValidNode(g2_node2); EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); - EXPECT_EQ(string("node id 3 is >= than number of nodes in graph 3"), + EXPECT_EQ(std::string("node id 3 is >= than number of nodes in graph 3"), s.message()); // valid id_ but different ptr s = graph_.IsValidNode(g2_node1); EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); - EXPECT_EQ(string("Node with id 2 is different from the passed in node. " - "Does it belong to a different graph?"), + EXPECT_EQ(std::string("Node with id 2 is different from the passed in node. " + "Does it belong to a different graph?"), s.message()); } @@ -695,8 +695,8 @@ TEST_F(GraphTest, BuildNodeNameIndex) { auto node_name_index = graph_.BuildNodeNameIndex(); EXPECT_EQ(node_name_index.size(), 5); - std::vector node_names{"_SOURCE", "_SINK", "A", "B", "C"}; - for (const string& node_name : node_names) { + std::vector node_names{"_SOURCE", "_SINK", "A", "B", "C"}; + for (const std::string& node_name : node_names) { EXPECT_NE(node_name_index.find(node_name), node_name_index.end()); EXPECT_EQ(node_name_index[node_name], FindNode(node_name)); } diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc index e2fe533ce4b238..e29d2d92d4c597 100644 --- a/tensorflow/core/graph/node_builder.cc +++ b/tensorflow/core/graph/node_builder.cc @@ -108,7 +108,7 @@ NodeBuilder& NodeBuilder::Device(absl::string_view device_spec) { } NodeBuilder& NodeBuilder::AssignedDevice(absl::string_view device) { - assigned_device_ = string(device); + assigned_device_ = std::string(device); return *this; } diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h index 6f249371606b3e..476393cae8166b 100644 --- a/tensorflow/core/graph/node_builder.h +++ b/tensorflow/core/graph/node_builder.h @@ -67,8 +67,8 @@ class NodeBuilder { // * a nullptr Node* was passed to the NodeOut constructor, or // * an out-of-range index was passed to the NodeOut constructor. bool error; - string name; - int32 index; + std::string name; + int32_t index; DataType dt; }; @@ -132,7 +132,7 @@ class NodeBuilder { absl::StatusOr Finalize(Graph* graph, bool consume = false); // Accessors for the values set in the constructor. 
- const string& node_name() const { return def_builder_.node_name(); } + const std::string& node_name() const { return def_builder_.node_name(); } const OpDef& op_def() const { return def_builder_.op_def(); } private: @@ -157,8 +157,8 @@ class NodeBuilder { const OpRegistryInterface* op_registry_; std::vector inputs_; std::vector control_inputs_; - std::vector errors_; - string assigned_device_; + std::vector errors_; + std::string assigned_device_; }; // IMPLEMENTATION ------------------------------------------------------------- diff --git a/tensorflow/core/graph/optimizer_cse.cc b/tensorflow/core/graph/optimizer_cse.cc index 39b53541081659..f18d8a3bca0f1a 100644 --- a/tensorflow/core/graph/optimizer_cse.cc +++ b/tensorflow/core/graph/optimizer_cse.cc @@ -93,9 +93,9 @@ static size_t kIllegalNodeHash = 0; class Hasher { public: - uint64 hash() { return h_ == kIllegalNodeHash ? kIllegalNodeHash + 1 : h_; } + uint64_t hash() { return h_ == kIllegalNodeHash ? kIllegalNodeHash + 1 : h_; } - void MixString(const string& s) { h_ = Hash64(s.data(), s.size(), h_); } + void MixString(const std::string& s) { h_ = Hash64(s.data(), s.size(), h_); } void MixInteger(size_t z) { h_ = Hash64Combine(h_, z); } @@ -122,7 +122,7 @@ class Hasher { // This kBufSize makes sizeof(HashingOutputStream) == 256. It's not chosen // for any particular reason except it's a nice even number of cache lines. static constexpr size_t kBufSize = 228; - static constexpr uint64 kDefaultSeed = 2570847921467975139ULL; + static constexpr uint64_t kDefaultSeed = 2570847921467975139ULL; bool Next(void** data, int* size) override { if (i_ == kBufSize) { // Mix the chunk in. @@ -174,7 +174,7 @@ class Hasher { bool AllowsAliasing() const override { return true; } - uint64 hash() { + uint64_t hash() { if (i_ != 0) { Mix(buf_, i_); i_ = 0; @@ -190,10 +190,10 @@ class Hasher { char buf_[kBufSize]; int i_ = 0; int64_t byte_count_ = 0; - uint64 h_ = kDefaultSeed; + uint64_t h_ = kDefaultSeed; }; - uint64 h_ = HashingOutputStream::kDefaultSeed; + uint64_t h_ = HashingOutputStream::kDefaultSeed; }; size_t OptimizerCSE::NodeHash(const Node* n) { diff --git a/tensorflow/core/graph/optimizer_cse_test.cc b/tensorflow/core/graph/optimizer_cse_test.cc index 94b4cabb2fd884..bac15370ae039e 100644 --- a/tensorflow/core/graph/optimizer_cse_test.cc +++ b/tensorflow/core/graph/optimizer_cse_test.cc @@ -36,7 +36,7 @@ limitations under the License. 
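The optimizer_cse.cc hunk above migrates the CSE Hasher to uint64_t: it mixes strings and integers into a running hash seeded with 2570847921467975139ULL and reserves 0 as the illegal "no hash" sentinel. The standalone sketch below mirrors that shape; the Boost-style mixing step is a stand-in for TensorFlow's Hash64/Hash64Combine, and the values mixed in main are illustrative.

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

// Sentinel reserved for "no hash computed yet"; never returned to callers.
constexpr uint64_t kIllegalNodeHash = 0;

class Hasher {
 public:
  void MixString(const std::string& s) { Mix(std::hash<std::string>{}(s)); }
  void MixInteger(uint64_t z) { Mix(z); }

  uint64_t hash() const {
    // Bump an accidental 0 to 1 so 0 stays usable as a sentinel.
    return h_ == kIllegalNodeHash ? kIllegalNodeHash + 1 : h_;
  }

 private:
  void Mix(uint64_t v) {
    // Boost-style combine; stands in for Hash64Combine in the real code.
    h_ ^= v + 0x9e3779b97f4a7c15ULL + (h_ << 6) + (h_ >> 2);
  }
  uint64_t h_ = 2570847921467975139ULL;  // arbitrary non-zero seed
};

int main() {
  Hasher hasher;
  hasher.MixString("MatMul");  // e.g. the op type
  hasher.MixInteger(2);        // e.g. the input count
  std::cout << hasher.hash() << "\n";
  return 0;
}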
namespace tensorflow { namespace { -static void InitGraph(const string& s, Graph* graph) { +static void InitGraph(const std::string& s, Graph* graph) { GraphDef graph_def; auto parser = protobuf::TextFormat::Parser(); @@ -50,14 +50,14 @@ class OptimizerCSETest : public ::testing::Test { public: OptimizerCSETest() : graph_(OpRegistry::Global()) {} - void InitGraph(const string& s) { + void InitGraph(const std::string& s) { ::tensorflow::InitGraph(s, &graph_); original_ = CanonicalGraphString(&graph_); } static bool IncludeNode(const Node* n) { return n->IsOp(); } - static string EdgeId(const Node* n, int index) { + static std::string EdgeId(const Node* n, int index) { if (index == 0) { return n->name(); } else if (index == Graph::kControlSlot) { @@ -67,9 +67,9 @@ class OptimizerCSETest : public ::testing::Test { } } - string CanonicalGraphString(Graph* g) { - std::vector nodes; - std::vector edges; + std::string CanonicalGraphString(Graph* g) { + std::vector nodes; + std::vector edges; for (const Node* n : g->nodes()) { if (IncludeNode(n)) { nodes.push_back(absl::StrCat(n->name(), "(", n->type_string(), ")")); @@ -88,21 +88,22 @@ class OptimizerCSETest : public ::testing::Test { absl::StrJoin(edges, ";")); } - string DoCSE(const std::function& consider_fn = nullptr) { - string before = CanonicalGraphString(&graph_); + std::string DoCSE( + const std::function& consider_fn = nullptr) { + std::string before = CanonicalGraphString(&graph_); LOG(ERROR) << "Before rewrites: " << before; OptimizeCSE(&graph_, consider_fn); - string result = CanonicalGraphString(&graph_); + std::string result = CanonicalGraphString(&graph_); LOG(ERROR) << "After rewrites: " << result; return result; } - const string& OriginalGraph() const { return original_; } + const std::string& OriginalGraph() const { return original_; } Graph graph_; - string original_; + std::string original_; }; REGISTER_OP("Input").Output("o: float").SetIsStateful(); @@ -339,8 +340,8 @@ TEST_F(OptimizerCSETest, Constant_Dedup) { EXPECT_EQ(OriginalGraph(), "n/_0(Const);n/_1(Const);n/_2(Const);n/_3(Const);" "n/_4(Const);n/_5(Const);n/_6(Const);n/_7(Const)|"); - std::vector nodes = str_util::Split(DoCSE(), ";|"); - std::set node_set(nodes.begin(), nodes.end()); + std::vector nodes = str_util::Split(DoCSE(), ";|"); + std::set node_set(nodes.begin(), nodes.end()); // Expect exactly one of each type of node to be retained after CSE. 
EXPECT_EQ(node_set.count("n/_0(Const)") + node_set.count("n/_7(Const)"), 1); EXPECT_EQ(node_set.count("n/_1(Const)") + node_set.count("n/_6(Const)"), 1); @@ -350,14 +351,14 @@ TEST_F(OptimizerCSETest, Constant_Dedup) { void BM_CSE(::testing::benchmark::State& state) { const int op_nodes = state.range(0); - string s; + std::string s; for (int in = 0; in < 10; in++) { - s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in); + s += absl::StrFormat("node { name: 'in%04d' op: 'Input'}", in); } random::PhiloxRandom philox(301, 17); random::SimplePhilox rnd(&philox); for (int op = 0; op < op_nodes; op++) { - s += strings::Printf( + s += absl::StrFormat( "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { " "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }", op, rnd.Uniform(10), rnd.Uniform(10)); diff --git a/tensorflow/core/graph/regularization/simple_delete_test.cc b/tensorflow/core/graph/regularization/simple_delete_test.cc index 2eac003707755f..424c0384823cb8 100644 --- a/tensorflow/core/graph/regularization/simple_delete_test.cc +++ b/tensorflow/core/graph/regularization/simple_delete_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/graph/regularization/simple_delete.h" +#include #include #include "absl/status/statusor.h" diff --git a/tensorflow/core/graph/subgraph.cc b/tensorflow/core/graph/subgraph.cc index 1d03877c02583c..697defb2ef2558 100644 --- a/tensorflow/core/graph/subgraph.cc +++ b/tensorflow/core/graph/subgraph.cc @@ -61,7 +61,7 @@ absl::Status FeedInputs( out_feed_types->clear(); out_feed_types->reserve(feed_rewrites.size()); for (size_t i = 0; i < feed_rewrites.size(); ++i) { - const string& t = feed_rewrites[i]->endpoint_name(); + const std::string& t = feed_rewrites[i]->endpoint_name(); TensorId id(ParseTensorName(t)); auto iter = name_index->find(id.first); @@ -127,7 +127,7 @@ absl::Status FetchOutputs( out_fetch_nodes->clear(); out_fetch_nodes->reserve(fetch_rewrites.size()); for (size_t i = 0; i < fetch_rewrites.size(); ++i) { - const string& t = fetch_rewrites[i]->endpoint_name(); + const std::string& t = fetch_rewrites[i]->endpoint_name(); // Parse t into node_name and output_index. 
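The BM_CSE hunk above swaps strings::Printf for absl::StrFormat when it builds a textual GraphDef for the benchmark. Below is a standalone sketch of that text generation; std::snprintf and std::mt19937 stand in for absl::StrFormat and random::SimplePhilox so the sketch builds without TensorFlow or Abseil, and the helper name is invented for the sketch.

#include <cstdio>
#include <random>
#include <string>

// Builds the same kind of textual graph the benchmark parses: ten Input nodes
// followed by op_nodes Mul nodes wired to randomly chosen inputs.
std::string MakeBenchmarkGraphText(int op_nodes, unsigned seed) {
  std::string s;
  char buf[256];
  for (int in = 0; in < 10; ++in) {
    std::snprintf(buf, sizeof(buf), "node { name: 'in%04d' op: 'Input'}", in);
    s += buf;
  }
  std::mt19937 rnd(seed);  // stands in for random::SimplePhilox
  std::uniform_int_distribution<int> pick(0, 9);
  for (int op = 0; op < op_nodes; ++op) {
    std::snprintf(buf, sizeof(buf),
                  "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
                  "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
                  op, pick(rnd), pick(rnd));
    s += buf;
  }
  return s;
}

int main() {
  std::string graph_text = MakeBenchmarkGraphText(/*op_nodes=*/4, /*seed=*/17);
  std::printf("%zu bytes of graph text\n", graph_text.size());
  return 0;
}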
TensorId id(ParseTensorName(t)); @@ -174,7 +174,7 @@ absl::Status FetchOutputs( return absl::OkStatus(); } -bool AddNodeToTargets(const string& node_or_tensor_name, +bool AddNodeToTargets(const std::string& node_or_tensor_name, const NameIndex& name_index, std::unordered_set* targets) { TensorId id = ParseTensorName(node_or_tensor_name); @@ -188,17 +188,18 @@ bool AddNodeToTargets(const string& node_or_tensor_name, return true; } -absl::Status PruneForTargets(Graph* g, const NameIndex& name_index, - const std::vector& fetch_nodes, - const absl::Span& target_nodes) { - string not_found; +absl::Status PruneForTargets( + Graph* g, const NameIndex& name_index, + const std::vector& fetch_nodes, + const absl::Span& target_nodes) { + std::string not_found; std::unordered_set targets; for (Node* n : fetch_nodes) { if (!AddNodeToTargets(n->name(), name_index, &targets)) { absl::StrAppend(¬_found, n->name(), " "); } } - for (const string& s : target_nodes) { + for (const std::string& s : target_nodes) { if (!AddNodeToTargets(s, name_index, &targets)) { absl::StrAppend(¬_found, s, " "); } @@ -295,9 +296,9 @@ absl::Status SendFetchRewrite::AddNode(Graph* g, } absl::Status RewriteGraphForExecution( - Graph* g, const absl::Span& fed_outputs, - const absl::Span& fetch_outputs, - const absl::Span& target_node_names, + Graph* g, const absl::Span& fed_outputs, + const absl::Span& fetch_outputs, + const absl::Span& target_node_names, const DeviceAttributes& device_info, bool use_function_convention, RewriteGraphMetadata* out_metadata) { std::vector> feed_rewrites; @@ -305,10 +306,10 @@ absl::Status RewriteGraphForExecution( if (use_function_convention) { for (size_t i = 0; i < fed_outputs.size(); ++i) { feed_rewrites.emplace_back(new ArgFeedRewrite( - &fed_outputs[i], &device_info, static_cast(i))); + &fed_outputs[i], &device_info, static_cast(i))); } } else { - for (const string& fed_output : fed_outputs) { + for (const std::string& fed_output : fed_outputs) { feed_rewrites.emplace_back( new RecvFeedRewrite(&fed_output, &device_info)); } @@ -319,10 +320,10 @@ absl::Status RewriteGraphForExecution( if (use_function_convention) { for (size_t i = 0; i < fetch_outputs.size(); ++i) { fetch_rewrites.emplace_back(new RetvalFetchRewrite( - &fetch_outputs[i], &device_info, static_cast(i))); + &fetch_outputs[i], &device_info, static_cast(i))); } } else { - for (const string& fetch_output : fetch_outputs) { + for (const std::string& fetch_output : fetch_outputs) { fetch_rewrites.emplace_back( new SendFetchRewrite(&fetch_output, &device_info)); } @@ -334,22 +335,22 @@ absl::Status RewriteGraphForExecution( namespace { template -std::vector ConvertToVector(StringContainer field) { - return std::vector(field.begin(), field.end()); +std::vector ConvertToVector(StringContainer field) { + return std::vector(field.begin(), field.end()); } } // namespace absl::Status RewriteGraphForExecution( Graph* g, const std::vector>& feed_rewrites, const std::vector>& fetch_rewrites, - const absl::Span& target_node_names, + const absl::Span& target_node_names, RewriteGraphMetadata* out_metadata) { if (fetch_rewrites.empty() && target_node_names.empty()) { return errors::InvalidArgument( "Must specify at least one target to fetch or execute."); } - std::unordered_set endpoints; + std::unordered_set endpoints; for (const auto& feed_rewrite : feed_rewrites) { auto result = endpoints.insert(feed_rewrite->endpoint_name()); if (!result.second) { diff --git a/tensorflow/core/graph/subgraph.h b/tensorflow/core/graph/subgraph.h index 
37013b8f7d09ee..c8843a37d58fa9 100644 --- a/tensorflow/core/graph/subgraph.h +++ b/tensorflow/core/graph/subgraph.h @@ -50,7 +50,8 @@ struct RewriteGraphMetadata { class PruneRewrite { public: // `endpoint_name` and `device_info` must outlive this object. - PruneRewrite(const string* endpoint_name, const DeviceAttributes* device_info) + PruneRewrite(const std::string* endpoint_name, + const DeviceAttributes* device_info) : endpoint_name_(endpoint_name), device_info_(device_info) {} virtual ~PruneRewrite() {} @@ -60,14 +61,14 @@ class PruneRewrite { Node** out_node) = 0; // Returns the name of the tensor to which this rewrite applies. - const string& endpoint_name() { return *endpoint_name_; } + const std::string& endpoint_name() { return *endpoint_name_; } protected: // The device on which the new node will be created. const DeviceAttributes& device_info() { return *device_info_; } private: - const string* const endpoint_name_; // Not owned. + const std::string* const endpoint_name_; // Not owned. const DeviceAttributes* const device_info_; // Not owned. }; @@ -98,9 +99,9 @@ class PruneRewrite { // - fetch output "node:output_index" does not exist in "*g" // - target node "node" does not exist in "*g" absl::Status RewriteGraphForExecution( - Graph* g, const absl::Span& fed_outputs, - const absl::Span& fetch_outputs, - const absl::Span& target_node_names, + Graph* g, const absl::Span& fed_outputs, + const absl::Span& fetch_outputs, + const absl::Span& target_node_names, const DeviceAttributes& device_info, bool use_function_convention, RewriteGraphMetadata* out_metadata); @@ -109,7 +110,7 @@ absl::Status RewriteGraphForExecution( absl::Status RewriteGraphForExecution( Graph* g, const std::vector>& feed_rewrites, const std::vector>& fetch_rewrites, - const absl::Span& target_node_names, + const absl::Span& target_node_names, RewriteGraphMetadata* out_metadata); ///////////////////////////////////////////////////////// @@ -119,14 +120,14 @@ absl::Status RewriteGraphForExecution( // A rewrite action that adds an _Arg node for a fed tensor. class ArgFeedRewrite : public PruneRewrite { public: - ArgFeedRewrite(const string* endpoint_name, + ArgFeedRewrite(const std::string* endpoint_name, const DeviceAttributes* device_info, int32_t arg_index) : PruneRewrite(endpoint_name, device_info), arg_index_(arg_index) {} absl::Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor, Node** out_node) override; private: - const int32 arg_index_; + const int32_t arg_index_; }; // A rewrite action that adds a client-terminated _Recv node for a fed tensor. @@ -140,14 +141,14 @@ class RecvFeedRewrite : public PruneRewrite { // A rewrite action that adds a _Retval node for a fetched tensor. 
class RetvalFetchRewrite : public PruneRewrite { public: - RetvalFetchRewrite(const string* endpoint_name, + RetvalFetchRewrite(const std::string* endpoint_name, const DeviceAttributes* device_info, int32_t retval_index) : PruneRewrite(endpoint_name, device_info), retval_index_(retval_index) {} absl::Status AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor, Node** out_node) override; private: - const int32 retval_index_; + const int32_t retval_index_; }; // A rewrite action that adds a client-terminated _Send node for a diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc index 31c5cf8a3bb444..a5f4be88e8e5de 100644 --- a/tensorflow/core/graph/subgraph_test.cc +++ b/tensorflow/core/graph/subgraph_test.cc @@ -49,24 +49,24 @@ class SubgraphTest : public ::testing::Test { ~SubgraphTest() override {} - void ExpectOK(const string& gdef_ascii) { + void ExpectOK(const std::string& gdef_ascii) { CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &gdef_)); GraphConstructorOptions opts; TF_CHECK_OK(ConvertGraphDefToGraph(opts, gdef_, g_.get())); } - Node* FindNode(const string& name) { + Node* FindNode(const std::string& name) { for (Node* n : g_->nodes()) { if (n->name() == name) return n; } return nullptr; } - bool HasNode(const string& name) { return FindNode(name) != nullptr; } + bool HasNode(const std::string& name) { return FindNode(name) != nullptr; } - void ExpectNodes(const string& nodes) { + void ExpectNodes(const std::string& nodes) { int count = 0; - std::vector actual_nodes; + std::vector actual_nodes; for (Node* n : g_->nodes()) { if (n->IsOp()) { count++; @@ -77,9 +77,9 @@ class SubgraphTest : public ::testing::Test { LOG(INFO) << "Nodes present: " << absl::StrJoin(actual_nodes, " "); - std::vector expected_nodes = str_util::Split(nodes, ','); + std::vector expected_nodes = str_util::Split(nodes, ','); std::sort(expected_nodes.begin(), expected_nodes.end()); - for (const string& s : expected_nodes) { + for (const std::string& s : expected_nodes) { Node* n = FindNode(s); EXPECT_TRUE(n != nullptr) << s; if (n->type_string() == "_Send" || n->type_string() == "_Recv") { @@ -92,7 +92,8 @@ class SubgraphTest : public ::testing::Test { << "\nExpected: " << absl::StrJoin(expected_nodes, ","); } - bool HasEdge(const string& src, int src_out, const string& dst, int dst_in) { + bool HasEdge(const std::string& src, int src_out, const std::string& dst, + int dst_in) { for (const Edge* e : g_->edges()) { if (e->src()->name() == src && e->src_output() == src_out && e->dst()->name() == dst && e->dst_input() == dst_in) @@ -100,20 +101,20 @@ class SubgraphTest : public ::testing::Test { } return false; } - bool HasControlEdge(const string& src, const string& dst) { + bool HasControlEdge(const std::string& src, const std::string& dst) { return HasEdge(src, Graph::kControlSlot, dst, Graph::kControlSlot); } - string Subgraph(const string& fed_str, const string& fetch_str, - const string& targets_str, - bool use_function_convention = false) { + std::string Subgraph(const std::string& fed_str, const std::string& fetch_str, + const std::string& targets_str, + bool use_function_convention = false) { Graph* subgraph = new Graph(OpRegistry::Global()); CopyGraph(*g_, subgraph); - std::vector fed = + std::vector fed = str_util::Split(fed_str, ',', str_util::SkipEmpty()); - std::vector fetch = + std::vector fetch = str_util::Split(fetch_str, ',', str_util::SkipEmpty()); - std::vector targets = + std::vector targets = str_util::Split(targets_str, ',', 
str_util::SkipEmpty()); subgraph::RewriteGraphMetadata metadata; @@ -355,7 +356,7 @@ void BM_SubgraphHelper(::testing::benchmark::State& state, GraphDefBuilder b(GraphDefBuilder::kFailImmediately); Node* last_node = nullptr; for (int i = 0; i < num_nodes; i++) { - string name = absl::StrCat("N", i); + std::string name = absl::StrCat("N", i); if (i > 0) { last_node = ops::UnaryOp("Op", last_node, b.opts().WithName(name)); } else { @@ -365,12 +366,12 @@ void BM_SubgraphHelper(::testing::benchmark::State& state, TF_CHECK_OK(GraphDefBuilderToGraph(b, &g)); } - std::vector fed; + std::vector fed; if (num_nodes > 1000) { fed.push_back(absl::StrCat("N", num_nodes - 1000)); } - std::vector fetch; - std::vector targets = {absl::StrCat("N", num_nodes - 1)}; + std::vector fetch; + std::vector targets = {absl::StrCat("N", num_nodes - 1)}; for (auto s : state) { Graph* subgraph = new Graph(OpRegistry::Global()); diff --git a/tensorflow/core/graph/tensor_id.h b/tensorflow/core/graph/tensor_id.h index 31b30fa14af463..30caf3857e303c 100644 --- a/tensorflow/core/graph/tensor_id.h +++ b/tensorflow/core/graph/tensor_id.h @@ -44,7 +44,7 @@ struct TensorId : public std::pair { const absl::string_view node() const { return first; } int index() const { return second; } - string ToString() const { + std::string ToString() const { if (second == Graph::kControlSlot) return absl::StrCat("^", first); return absl::StrCat(first, ":", second); } @@ -63,19 +63,19 @@ bool IsTensorIdControl(const TensorId& tensor_id); // Same as TensorId, except owns the backing storage for the op name. This makes // the memory management simpler at the expense of a copy. -struct SafeTensorId : public std::pair { - typedef std::pair Base; +struct SafeTensorId : public std::pair { + typedef std::pair Base; // NOTE(skyewm): this is required on some platforms. I'm not sure why the // using "using Base::pair;" isn't always sufficient. SafeTensorId() : Base() {} - SafeTensorId(const string& str, int idx) : Base(str, idx) {} + SafeTensorId(const std::string& str, int idx) : Base(str, idx) {} SafeTensorId(const TensorId& id); - const string& node() const { return first; } + const std::string& node() const { return first; } int index() const { return second; } - string ToString() const { + std::string ToString() const { if (second == Graph::kControlSlot) return absl::StrCat("^", first); return absl::StrCat(first, ":", second); } diff --git a/tensorflow/core/graph/tensor_id_test.cc b/tensorflow/core/graph/tensor_id_test.cc index 15bffd170642c8..4bec9298680b78 100644 --- a/tensorflow/core/graph/tensor_id_test.cc +++ b/tensorflow/core/graph/tensor_id_test.cc @@ -23,7 +23,9 @@ limitations under the License. 
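tensor_id.h above keeps TensorId as a (node name, output index) pair whose ToString() prints "name:index", or "^name" when the index is the control slot, while SafeTensorId now owns a std::string copy of the name. The standalone sketch below reproduces that shape together with a rough counterpart of ParseTensorName; the -1 control-slot value and the parser details are assumptions made for the sketch, not a transcription of TensorFlow's implementation.

#include <iostream>
#include <string>
#include <utility>

constexpr int kControlSlot = -1;  // stands in for Graph::kControlSlot

struct TensorId : std::pair<std::string, int> {
  using std::pair<std::string, int>::pair;
  std::string ToString() const {
    if (second == kControlSlot) return "^" + first;
    return first + ":" + std::to_string(second);
  }
};

// Rough counterpart of ParseTensorName: "^foo" is a control edge on foo,
// "foo:2" is output 2 of foo, and a bare "foo" defaults to output 0.
TensorId ParseName(const std::string& name) {
  if (!name.empty() && name[0] == '^') {
    return TensorId(name.substr(1), kControlSlot);
  }
  auto colon = name.rfind(':');
  if (colon == std::string::npos) return TensorId(name, 0);
  return TensorId(name.substr(0, colon), std::stoi(name.substr(colon + 1)));
}

int main() {
  std::cout << ParseName("W1").ToString() << "\n";     // W1:0
  std::cout << ParseName("foo:2").ToString() << "\n";  // foo:2
  std::cout << ParseName("^foo").ToString() << "\n";   // ^foo
  return 0;
}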
namespace tensorflow { namespace { -string ParseHelper(const string& n) { return ParseTensorName(n).ToString(); } +std::string ParseHelper(const std::string& n) { + return ParseTensorName(n).ToString(); +} TEST(TensorIdTest, ParseTensorName) { EXPECT_EQ(ParseHelper("W1"), "W1:0"); @@ -35,8 +37,8 @@ TEST(TensorIdTest, ParseTensorName) { EXPECT_EQ(ParseHelper("^foo"), "^foo"); } -uint32 Skewed(random::SimplePhilox* rnd, int max_log) { - const uint32 space = 1 << (rnd->Rand32() % (max_log + 1)); +uint32_t Skewed(random::SimplePhilox* rnd, int max_log) { + const uint32_t space = 1 << (rnd->Rand32() % (max_log + 1)); return rnd->Rand32() % space; } @@ -44,9 +46,9 @@ void BM_ParseTensorName(::testing::benchmark::State& state) { const int arg = state.range(0); random::PhiloxRandom philox(301, 17); random::SimplePhilox rnd(&philox); - std::vector names; + std::vector names; for (int i = 0; i < 100; i++) { - string name; + std::string name; switch (arg) { case 0: { // Generate random names size_t len = Skewed(&rnd, 4); @@ -92,7 +94,7 @@ void BM_ParseTensorName(::testing::benchmark::State& state) { BENCHMARK(BM_ParseTensorName)->Arg(0)->Arg(1)->Arg(2)->Arg(3)->Arg(4)->Arg(5); TEST(TensorIdTest, IsTensorIdControl) { - string input = "^foo"; + std::string input = "^foo"; TensorId tensor_id = ParseTensorName(input); EXPECT_TRUE(IsTensorIdControl(tensor_id)); @@ -106,7 +108,7 @@ TEST(TensorIdTest, IsTensorIdControl) { } TEST(TensorIdTest, PortZero) { - for (string input : {"foo", "foo:0"}) { + for (std::string input : {"foo", "foo:0"}) { TensorId tensor_id = ParseTensorName(input); EXPECT_EQ("foo", tensor_id.node()); EXPECT_EQ(0, tensor_id.index()); diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc index f83bf238cde9d1..b882361aa8093e 100644 --- a/tensorflow/core/graph/testlib.cc +++ b/tensorflow/core/graph/testlib.cc @@ -32,8 +32,9 @@ namespace tensorflow { namespace test { namespace graph { -Node* Send(Graph* g, Node* input, const string& tensor, const string& sender, - const uint64 sender_incarnation, const string& receiver) { +Node* Send(Graph* g, Node* input, const std::string& tensor, + const std::string& sender, const uint64_t sender_incarnation, + const std::string& receiver) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_Send") .Input(input, 0) @@ -46,9 +47,9 @@ Node* Send(Graph* g, Node* input, const string& tensor, const string& sender, return ret; } -Node* Recv(Graph* g, const string& tensor, const string& type, - const string& sender, const uint64 sender_incarnation, - const string& receiver) { +Node* Recv(Graph* g, const std::string& tensor, const std::string& type, + const std::string& sender, const uint64_t sender_incarnation, + const std::string& receiver) { Node* ret; DataType dtype; CHECK(DataTypeFromString(type, &dtype)); @@ -72,7 +73,7 @@ Node* Constant(Graph* g, const Tensor& tensor) { return ret; } -Node* Constant(Graph* g, const Tensor& tensor, const string& name) { +Node* Constant(Graph* g, const Tensor& tensor, const std::string& name) { Node* ret; TF_CHECK_OK(NodeBuilder(name, "Const") .Attr("dtype", tensor.dtype()) @@ -85,7 +86,7 @@ Node* HostConstant(Graph* g, const Tensor& tensor) { return HostConstant(g, tensor, g->NewName("n")); } -Node* HostConstant(Graph* g, const Tensor& tensor, const string& name) { +Node* HostConstant(Graph* g, const Tensor& tensor, const std::string& name) { Node* ret; TF_CHECK_OK(NodeBuilder(name, "HostConst") .Attr("dtype", tensor.dtype()) @@ -104,7 +105,7 @@ Node* Var(Graph* g, const DataType dtype, const 
TensorShape& shape) { } Node* Var(Graph* g, const DataType dtype, const TensorShape& shape, - const string& name) { + const std::string& name) { Node* ret; TF_CHECK_OK(NodeBuilder(name, "Variable") .Attr("dtype", dtype) @@ -134,7 +135,7 @@ Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive, bool reverse) { return ret; } -Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes, +Node* Reduce(Graph* g, const std::string& reduce, Node* data, Node* axes, bool keep_dims) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), reduce, g->op_registry()) @@ -179,7 +180,7 @@ Node* BatchMatmul(Graph* g, Node* in0, Node* in1, bool adj_x, bool adj_y) { return ret; } -Node* RandomNumberGenerator(const string& op, Graph* g, Node* input, +Node* RandomNumberGenerator(const std::string& op, Graph* g, Node* input, DataType dtype) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), op, g->op_registry()) @@ -222,7 +223,7 @@ Node* RandomPoisson(Graph* g, Node* shape, Node* lam) { return ret; } -Node* Unary(Graph* g, const string& func, Node* input, int index) { +Node* Unary(Graph* g, const std::string& func, Node* input, int index) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), func, g->op_registry()) .Input(input, index) @@ -230,7 +231,7 @@ Node* Unary(Graph* g, const string& func, Node* input, int index) { return ret; } -Node* Binary(Graph* g, const string& func, Node* in0, Node* in1) { +Node* Binary(Graph* g, const std::string& func, Node* in0, Node* in1) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), func, g->op_registry()) .Input(in0) @@ -239,7 +240,7 @@ Node* Binary(Graph* g, const string& func, Node* in0, Node* in1) { return ret; } -Node* Multi(Graph* g, const string& func, absl::Span ins) { +Node* Multi(Graph* g, const std::string& func, absl::Span ins) { Node* ret; auto b = NodeBuilder(g->NewName("n"), func, g->op_registry()); for (Node* n : ins) b = b.Input(n); @@ -271,7 +272,7 @@ Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) { return ret; } -Node* Error(Graph* g, Node* input, const string& errmsg, bool log_error) { +Node* Error(Graph* g, Node* input, const std::string& errmsg, bool log_error) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error") .Input(input) @@ -317,7 +318,7 @@ Node* Switch(Graph* g, Node* in0, Node* in1) { return ret; } -Node* Enter(Graph* g, Node* input, const string& frame_name) { +Node* Enter(Graph* g, Node* input, const std::string& frame_name) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Enter") .Input(input) @@ -341,11 +342,11 @@ Node* Merge(Graph* g, Node* in0, Node* in1) { return ret; } -Node* Merge(Graph* g, Node* in0, absl::Span remaining_in) { +Node* Merge(Graph* g, Node* in0, absl::Span remaining_in) { std::vector inputs; inputs.reserve(remaining_in.size() + 1); inputs.emplace_back(in0); - for (const string& in_name : remaining_in) { + for (const std::string& in_name : remaining_in) { inputs.emplace_back(in_name, 0, inputs[0].dt); } @@ -383,7 +384,7 @@ Node* ConcatV2(Graph* g, absl::Span tensors, Node* concat_dim) { return ret; } -Node* Next(Graph* g, const string& name, Node* input) { +Node* Next(Graph* g, const std::string& name, Node* input) { Node* ret; TF_CHECK_OK( NodeBuilder(name, "NextIteration").Input(input).Finalize(g, &ret)); @@ -497,7 +498,7 @@ Node* DiagPart(Graph* g, Node* in, DataType type) { return ret; } -Node* CheckNumerics(Graph* g, Node* in, const string& message) { +Node* CheckNumerics(Graph* g, Node* in, const std::string& message) { Node* ret; 
TF_CHECK_OK(NodeBuilder(g->NewName("n"), "CheckNumerics") .Input(in) diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h index df7843f884b17d..f4df5a4ed4d038 100644 --- a/tensorflow/core/graph/testlib.h +++ b/tensorflow/core/graph/testlib.h @@ -39,7 +39,7 @@ void ToGraphDef(Graph* g, GraphDef* def); // Adds a node in "g" producing a constant "tensor". Node* Constant(Graph* g, const Tensor& tensor); -Node* Constant(Graph* g, const Tensor& tensor, const string& name); +Node* Constant(Graph* g, const Tensor& tensor, const std::string& name); // Adds a node in "g" producing a constant "tensor" on the host. // The given node which, unlike the regular Constant above, always @@ -47,26 +47,27 @@ Node* Constant(Graph* g, const Tensor& tensor, const string& name); // in GPU tests where the test Op in question runs on the device // but requires some arguments to be pinned to the host. Node* HostConstant(Graph* g, const Tensor& tensor); -Node* HostConstant(Graph* g, const Tensor& tensor, const string& name); +Node* HostConstant(Graph* g, const Tensor& tensor, const std::string& name); // Adds a variable in "g" of the given "shape" and "dtype". Node* Var(Graph* g, DataType dtype, const TensorShape& shape); Node* Var(Graph* g, DataType dtype, const TensorShape& shape, - const string& name); + const std::string& name); // Adds an assign node in "g" which assigns "val" into "var". Node* Assign(Graph* g, Node* var, Node* val); // Adds a send node "g" sending "input" as a named "tensor" from // "sender" to "receiver". -Node* Send(Graph* g, Node* input, const string& tensor, const string& sender, - uint64 sender_incarnation, const string& receiver); +Node* Send(Graph* g, Node* input, const std::string& tensor, + const std::string& sender, uint64_t sender_incarnation, + const std::string& receiver); // Adds a recv node in "g" receiving a named "tensor" from "sender" // to "receiver". -Node* Recv(Graph* g, const string& tensor, const string& type, - const string& sender, uint64 sender_incarnation, - const string& receiver); +Node* Recv(Graph* g, const std::string& tensor, const std::string& type, + const std::string& sender, uint64_t sender_incarnation, + const std::string& receiver); // Adds a cumsum "node" in "g" doing cumsum(data, axes). Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive = false, @@ -74,7 +75,7 @@ Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive = false, // Adds a reduction "node" in "g" doing sum(data, axes). "reduce" is // a reduction, e.g., Sum, Max, Min, Mean, etc. -Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes, +Node* Reduce(Graph* g, const std::string& reduce, Node* data, Node* axes, bool keep_dims = false); // Adds a Matmul node in g doing in0.contract(in1). @@ -89,17 +90,17 @@ Node* BatchMatmul(Graph* g, Node* in0, Node* in1, bool adj_x, bool adj_y); Node* QuantizeToUINT8(Graph* g, Node* data); // Adds a unary function "func" "node" in "g" taking "input". -Node* Unary(Graph* g, const string& func, Node* input, int index = 0); +Node* Unary(Graph* g, const std::string& func, Node* input, int index = 0); // Adds an identity node in "g" taking "input" and producing an // identity copy. Node* Identity(Graph* g, Node* input, int index = 0); // Adds a binary function "func" node in "g" taking "in0" and "in1". -Node* Binary(Graph* g, const string& func, Node* in0, Node* in1); +Node* Binary(Graph* g, const std::string& func, Node* in0, Node* in1); // Adds a function "func" node in "g" taking inputs "ins". 
-Node* Multi(Graph* g, const string& func, absl::Span ins); +Node* Multi(Graph* g, const std::string& func, absl::Span ins); // Adds a binary add node in "g" doing in0 + in1. Node* Add(Graph* g, Node* in0, Node* in1); @@ -131,7 +132,7 @@ Node* TruncatedNormal(Graph* g, Node* input, DataType dtype); // Adds an error node in "g". The node's computation always // generates an error with the given error message "errmsg". -Node* Error(Graph* g, Node* input, const string& errmsg, +Node* Error(Graph* g, Node* input, const std::string& errmsg, bool log_error = false); // Adds a node that generates a invalid ref output. @@ -150,7 +151,7 @@ Node* NoOp(Graph* g, const std::vector& control_inputs); Node* Switch(Graph* g, Node* in0, Node* in1); // Adds an Enter node in "g", which enters a new frame. -Node* Enter(Graph* g, Node* input, const string& frame_name); +Node* Enter(Graph* g, Node* input, const std::string& frame_name); // Adds an Exit node in "g", which exits a frame. Node* Exit(Graph* g, Node* input); @@ -160,11 +161,11 @@ Node* Merge(Graph* g, Node* in0, Node* in1); // Adds a Merge node in "g". The first input is "in0", the remaining // inputs are only given by their names in remaining_in. -Node* Merge(Graph* g, Node* in0, absl::Span remaining_in); +Node* Merge(Graph* g, Node* in0, absl::Span remaining_in); // Adds a NextIteration node in "g", which makes its input available // to the next iteration. -Node* Next(Graph* g, const string& name, Node* input); +Node* Next(Graph* g, const std::string& name, Node* input); // Adds a LoopCond node in "g", representing the "pivot" termination // condition of a loop. @@ -215,7 +216,7 @@ Node* Diag(Graph* g, Node* in, DataType type); Node* DiagPart(Graph* g, Node* in, DataType type); // Add a CheckNumerics node in "g". -Node* CheckNumerics(Graph* g, Node* in, const string& message); +Node* CheckNumerics(Graph* g, Node* in, const std::string& message); // Add an _Arg node in "g". Node* Arg(Graph* g, int64_t index, DataType type); diff --git a/tensorflow/core/graph/validate.cc b/tensorflow/core/graph/validate.cc index 154d9f26c80cf5..4572ceb9de7897 100644 --- a/tensorflow/core/graph/validate.cc +++ b/tensorflow/core/graph/validate.cc @@ -100,7 +100,7 @@ absl::Status ValidateGraphHasNoCycle(const Graph& graph) { } if (processed < graph.num_nodes()) { - std::vector nodes_in_cycle; + std::vector nodes_in_cycle; for (int i = 0; i < pending_count.size() && nodes_in_cycle.size() < 3; ++i) { if (pending_count[i] != 0) { diff --git a/tensorflow/core/graph/validate_test.cc b/tensorflow/core/graph/validate_test.cc index b593a2c9b63c7e..35e7ebb4cff6e0 100644 --- a/tensorflow/core/graph/validate_test.cc +++ b/tensorflow/core/graph/validate_test.cc @@ -38,7 +38,7 @@ REGISTER_OP("FloatInput").Output("o: float"); REGISTER_OP("Int32Input").Output("o: int32"); TEST(ValidateGraphDefTest, TestValidGraph) { - const string graph_def_str = + const std::string graph_def_str = "node { name: 'A' op: 'FloatInput' }" "node { name: 'B' op: 'FloatInput' }" "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" @@ -50,7 +50,7 @@ TEST(ValidateGraphDefTest, TestValidGraph) { } TEST(ValidateGraphDefTest, GraphWithUnspecifiedDefaultAttr) { - const string graph_def_str = + const std::string graph_def_str = "node { name: 'A' op: 'FloatInput' }" "node { name: 'B' op: 'Int32Input' }" "node { " @@ -74,7 +74,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedDefaultAttr) { TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) { // "DstT" attribute is missing. 
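The validate.cc hunk above shows the tail of ValidateGraphHasNoCycle: after a pending-count traversal, any node whose count never reached zero is reported (up to three of them) as evidence of a cycle. Below is a standalone sketch of that pending-count (Kahn's algorithm) check on a plain adjacency list; integer node ids stand in for the node names the real validator reports.

#include <iostream>
#include <queue>
#include <string>
#include <vector>

// Process nodes whose inputs are all satisfied; whatever is never processed
// sits on or behind a cycle, and up to three such nodes are returned.
std::vector<int> FindCycleNodes(const std::vector<std::vector<int>>& out_edges) {
  const int n = static_cast<int>(out_edges.size());
  std::vector<int> pending_count(n, 0);
  for (const auto& outs : out_edges)
    for (int dst : outs) ++pending_count[dst];

  std::queue<int> ready;
  for (int i = 0; i < n; ++i)
    if (pending_count[i] == 0) ready.push(i);

  int processed = 0;
  while (!ready.empty()) {
    int node = ready.front();
    ready.pop();
    ++processed;
    for (int dst : out_edges[node])
      if (--pending_count[dst] == 0) ready.push(dst);
  }

  std::vector<int> nodes_in_cycle;
  if (processed < n) {
    for (int i = 0; i < n && nodes_in_cycle.size() < 3; ++i)
      if (pending_count[i] != 0) nodes_in_cycle.push_back(i);
  }
  return nodes_in_cycle;
}

int main() {
  // 0 -> 1 -> 2 -> 1 forms a cycle between nodes 1 and 2.
  std::vector<std::vector<int>> g = {{1}, {2}, {1}};
  for (int id : FindCycleNodes(g)) std::cout << id << " ";  // prints: 1 2
  std::cout << "\n";
  return 0;
}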
- const string graph_def_str = + const std::string graph_def_str = "node { name: 'A' op: 'FloatInput' }" "node { " " name: 'B' op: 'Cast' " @@ -102,7 +102,7 @@ TEST(ValidateGraphDefAgainstOpListTest, GraphWithOpOnlyInOpList) { TF_ASSERT_OK(OpDefBuilder("UniqueSnowflake").Finalize(&op_reg_data)); OpList op_list; *op_list.add_op() = op_reg_data.op_def; - const string graph_def_str = "node { name: 'A' op: 'UniqueSnowflake' }"; + const std::string graph_def_str = "node { name: 'A' op: 'UniqueSnowflake' }"; GraphDef graph_def; auto parser = protobuf::TextFormat::Parser(); CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str; @@ -114,7 +114,7 @@ TEST(ValidateGraphDefAgainstOpListTest, GraphWithGlobalOpNotInOpList) { TF_ASSERT_OK(OpDefBuilder("NotAnywhere").Finalize(&op_reg_data)); OpList op_list; *op_list.add_op() = op_reg_data.op_def; - const string graph_def_str = "node { name: 'A' op: 'FloatInput' }"; + const std::string graph_def_str = "node { name: 'A' op: 'FloatInput' }"; GraphDef graph_def; auto parser = protobuf::TextFormat::Parser(); CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str; @@ -150,7 +150,7 @@ TEST(GetOpListForValidationTest, ShouldStripDocs) { } TEST(VerifyNoDuplicateNodeNames, NoDuplicateNodeNames) { - const string graph_def_str = + const std::string graph_def_str = "node { name: 'A' op: 'FloatInput' }" "node { name: 'B' op: 'Int32Input' }" "node { " @@ -165,7 +165,7 @@ TEST(VerifyNoDuplicateNodeNames, NoDuplicateNodeNames) { } TEST(VerifyNoDuplicateNodeNames, DuplicateNodeNames) { - const string graph_def_str = + const std::string graph_def_str = "node { name: 'A' op: 'FloatInput' }" "node { name: 'A' op: 'Int32Input' }" "node { " @@ -181,7 +181,7 @@ TEST(VerifyNoDuplicateNodeNames, DuplicateNodeNames) { } TEST(ValidateGraphHasNoCycleTest, NoCyclePasses) { - const string graph_def_str = + const std::string graph_def_str = "node { name: 'A' op: 'FloatInput' }" "node { name: 'B' op: 'FloatInput' }" "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" @@ -198,7 +198,7 @@ TEST(ValidateGraphHasNoCycleTest, NoCyclePasses) { } TEST(ValidateGraphHasNoCycleTest, NoCycleWithMergePasses) { - const string graph_def_str = + const std::string graph_def_str = R"EOF( node { name: 'A' op: 'FloatInput' } node { name: 'merge' op: 'Merge' input: [ 'A:0', 'next:0' ] @@ -221,8 +221,8 @@ TEST(ValidateGraphHasNoCycleTest, NoCycleWithMergePasses) { TF_EXPECT_OK(graph::ValidateGraphHasNoCycle(graph)); } -Node* AddNodeFromNodeDef(Graph& graph, const string& name, - const string& node_type, int num_inputs) { +Node* AddNodeFromNodeDef(Graph& graph, const std::string& name, + const std::string& node_type, int num_inputs) { auto builder = NodeDefBuilder(name, node_type); for (int i = 0; i < num_inputs; ++i) { builder = builder.Input(absl::StrCat("node_", i), i, DT_FLOAT); diff --git a/tensorflow/core/graph/while_context.h b/tensorflow/core/graph/while_context.h index e23e9df90afd2d..4f15b7d37c7b18 100644 --- a/tensorflow/core/graph/while_context.h +++ b/tensorflow/core/graph/while_context.h @@ -39,7 +39,7 @@ class WhileContext { std::vector body_inputs, std::vector body_outputs); - const string& frame_name() const { return frame_name_; } + const std::string& frame_name() const { return frame_name_; } const std::vector& enter_nodes() const { return enter_nodes_; } const std::vector& exit_nodes() const { return exit_nodes_; } const OutputTensor& cond_output() const { return cond_output_; } @@ -53,7 +53,7 @@ class WhileContext { // uniquely 
identified by its frame name. Frames are used by the executor to // manage the iterations of a loop. See the FrameState comment in // core/common_runtime/executor.cc for more details. - const string frame_name_; + const std::string frame_name_; // The enter nodes defining the input loop variables to the while loop. This // vector defines the order of the loop variables. diff --git a/tensorflow/core/grappler/graph_analyzer/gen_node.cc b/tensorflow/core/grappler/graph_analyzer/gen_node.cc index 42dffe79ecabe3..1e355d45a91ec1 100644 --- a/tensorflow/core/grappler/graph_analyzer/gen_node.cc +++ b/tensorflow/core/grappler/graph_analyzer/gen_node.cc @@ -34,7 +34,7 @@ GenNode::GenNode(const NodeDef* node) : node_(node), op_(nullptr) {} absl::Status GenNode::BuildGraphInMap(const GraphDef& source, GenNodeMap* map) { for (const auto& n : source.node()) { - const string& name = n.name(); + const std::string& name = n.name(); if (map->find(name) != map->end()) { // This error code looks more meaningful than ALREADY_EXISTS. return absl::Status(absl::StatusCode::kInvalidArgument, @@ -95,7 +95,7 @@ absl::Status GenNode::ParseInputs(const GenNodeMap* map) { for (int i = 0; i < n_inputs; ++i) { int other_position; - string other_name = ParseNodeName(node_->input(i), &other_position); + std::string other_name = ParseNodeName(node_->input(i), &other_position); auto other_it = map->find(other_name); if (other_it == map->end()) { return absl::Status( @@ -138,8 +138,8 @@ bool GenNode::IsMultiInput(Port port) const { return (it->second.size() > 1); } -GenNode::Port::operator string() const { - string result = this->IsInbound() ? "i" : "o"; +GenNode::Port::operator std::string() const { + std::string result = this->IsInbound() ? "i" : "o"; if (this->IsControl()) { result.append("C"); } else { diff --git a/tensorflow/core/grappler/graph_analyzer/gen_node.h b/tensorflow/core/grappler/graph_analyzer/gen_node.h index 57d5f59ec2ccd7..7194a48a6a2538 100644 --- a/tensorflow/core/grappler/graph_analyzer/gen_node.h +++ b/tensorflow/core/grappler/graph_analyzer/gen_node.h @@ -35,7 +35,7 @@ namespace graph_analyzer { class GenNode; // To find nodes by name. -using GenNodeMap = std::unordered_map>; +using GenNodeMap = std::unordered_map>; // One node in the graph, in the form convenient for traversal and generation of // subgraphs. It refers to the original NodeDef protobuf for most information @@ -51,8 +51,8 @@ class GenNode { explicit GenNode(const NodeDef* node); // Access wrappers. - const string& name() const { return node_->name(); } - const string& opcode() const { return node_->op(); } + const std::string& name() const { return node_->name(); } + const std::string& opcode() const { return node_->op(); } const NodeDef* node_def() const { return node_; } // Parse the inputs of this node and update the map accordingly, creating the @@ -111,7 +111,7 @@ class GenNode { // Convenient for printing. I've really wanted it to be implicit but // ClangTidy insists on making it explicit. 
- explicit operator string() const; + explicit operator std::string() const; private: explicit Port(IntPort value) : value_(value) {} diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc index 2d47abda617615..dde0fb720c0170 100644 --- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc +++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc @@ -315,16 +315,16 @@ absl::Status GraphAnalyzer::CollateResult() { return absl::OkStatus(); } -std::vector GraphAnalyzer::DumpRawSubgraphs() { - std::vector result; +std::vector GraphAnalyzer::DumpRawSubgraphs() { + std::vector result; for (const auto& it : result_) { result.emplace_back(it->Dump()); } return result; } -std::vector GraphAnalyzer::DumpSubgraphs() { - std::vector result; +std::vector GraphAnalyzer::DumpSubgraphs() { + std::vector result; for (auto ptr : ordered_collation_) { result.emplace_back( absl::StrFormat("%d %s", ptr->count, ptr->sig->ToString())); diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h index 9a321e69b531fb..be46b6843225a6 100644 --- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h +++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h @@ -46,7 +46,7 @@ class GraphAnalyzer { absl::Status Run(); // Returns the subgraphs found in Run() printed to text. - std::vector DumpSubgraphs(); + std::vector DumpSubgraphs(); // Prints the subgraphs found in Run() to stdout. absl::Status OutputSubgraphs(); @@ -78,7 +78,7 @@ class GraphAnalyzer { absl::Status CollateResult(); // Returns the raw subgraphs found in FindSubgraphs() printed to text. - std::vector DumpRawSubgraphs(); + std::vector DumpRawSubgraphs(); // Finds and adds appropriately to either partial_ or result_ all the // subgraphs that can be created by extending the parent subgraph by one node. 
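For context on the interface touched above, here is a minimal, hypothetical caller sketch. Only the members visible in this patch are relied on: Run() returning absl::Status and DumpSubgraphs() returning std::vector<std::string>. The GraphAnalyzer(GraphDef, int) constructor shape is an assumption for illustration and is not shown in this diff.

#include <iostream>
#include <string>

#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/graph_analyzer/graph_analyzer.h"

// Hypothetical helper: runs the analyzer and prints every collated subgraph.
void PrintSubgraphsOfSize(const tensorflow::GraphDef& graph_def, int n) {
  // Assumed constructor shape (graph + subgraph size); treat as illustrative.
  tensorflow::grappler::graph_analyzer::GraphAnalyzer analyzer(graph_def, n);
  const absl::Status status = analyzer.Run();
  if (!status.ok()) {
    std::cerr << status.ToString() << "\n";
    return;
  }
  // After this patch DumpSubgraphs() returns std::vector<std::string>;
  // existing callers keep compiling because TensorFlow's `string` alias
  // already resolved to std::string.
  for (const std::string& subgraph : analyzer.DumpSubgraphs()) {
    std::cout << subgraph << "\n";
  }
}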
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc index 9822f5446f4b39..4e9220d3a5c7e5 100644 --- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc +++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc @@ -56,10 +56,12 @@ class GraphAnalyzerTest : public ::testing::Test, protected TestGraphs { gran_->ExtendSubgraphAllOrNone(parent, node); } - std::vector DumpRawSubgraphs() { return gran_->DumpRawSubgraphs(); } + std::vector DumpRawSubgraphs() { + return gran_->DumpRawSubgraphs(); + } - std::vector DumpPartials() { - std::vector result; + std::vector DumpPartials() { + std::vector result; for (const auto& it : gran_->partial_) { result.emplace_back(it->Dump()); } @@ -68,7 +70,9 @@ class GraphAnalyzerTest : public ::testing::Test, protected TestGraphs { const GenNodeMap& GetNodes() { return gran_->nodes_; } - GenNode* GetNode(const string& name) { return gran_->nodes_.at(name).get(); } + GenNode* GetNode(const std::string& name) { + return gran_->nodes_.at(name).get(); + } SubgraphPtrSet& GetResult() { return gran_->result_; } SubgraphPtrSet& GetPartial() { return gran_->partial_; } diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc index 72662005ecdec7..0b1b3af2ea5571 100644 --- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc +++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc @@ -31,7 +31,7 @@ namespace grappler { namespace graph_analyzer { // Dies on failure. -static void LoadModel(const string& filename, +static void LoadModel(const std::string& filename, tensorflow::MetaGraphDef* metagraph) { LOG(INFO) << "Loading model from " << filename; absl::Status st; @@ -49,7 +49,7 @@ static void LoadModel(const string& filename, // of train ops (if provided). void MaybePruneGraph(const tensorflow::MetaGraphDef& metagraph, tensorflow::GraphDef* graph) { - std::vector fetch_nodes; + std::vector fetch_nodes; for (const auto& fetch : metagraph.collection_def().at("train_op").node_list().value()) { LOG(INFO) << "Fetch node: " << fetch; @@ -72,7 +72,7 @@ void MaybePruneGraph(const tensorflow::MetaGraphDef& metagraph, } } -void GraphAnalyzerTool(const string& file_name, int n) { +void GraphAnalyzerTool(const std::string& file_name, int n) { if (n < 1) { LOG(FATAL) << "Invalid subgraph size " << n << ", must be at least 1"; } diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h index 5a91fe7dc8eb7d..85f75706acf4cb 100644 --- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h +++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h @@ -22,7 +22,7 @@ namespace tensorflow { namespace grappler { namespace graph_analyzer { -void GraphAnalyzerTool(const string& file_name, int n); +void GraphAnalyzerTool(const std::string& file_name, int n); } // end namespace graph_analyzer } // end namespace grappler diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node.cc b/tensorflow/core/grappler/graph_analyzer/sig_node.cc index 9210bf56b8047b..123bd0f060bccf 100644 --- a/tensorflow/core/grappler/graph_analyzer/sig_node.cc +++ b/tensorflow/core/grappler/graph_analyzer/sig_node.cc @@ -99,7 +99,7 @@ void SigNode::ComputeTopoHash0() { last_hashed_nodes_ = next_hashed_nodes_ = node_mask_; // TODO(babkin): include the attributes too, as an option. 
- size_t hval = std::hash()(opcode()); + size_t hval = std::hash()(opcode()); // Getting the topology of the links in to the hash early should get more // conflicts resolved early. @@ -208,8 +208,8 @@ bool SigNode::operator==(const SigNode& other) const { constexpr int Signature::kMaxGraphSize; -string Signature::ToString() const { - string result; +std::string Signature::ToString() const { + std::string result; for (size_t n = 0; n < nodes.size(); ++n) { // TODO(babkin): add attributes too. result += absl::StrFormat("%d:%s", n, nodes[n]->opcode()); @@ -219,9 +219,9 @@ string Signature::ToString() const { // The link entries are already sorted, by tags and then by the // node ranks. if (link.tag.local.IsInbound()) { - result += - absl::StrFormat("[%s:%s:%d]", string(link.tag.local), - string(link.tag.remote), entry.peer->unique_rank_); + result += absl::StrFormat("[%s:%s:%d]", std::string(link.tag.local), + std::string(link.tag.remote), + entry.peer->unique_rank_); } } result.push_back(','); diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node.h b/tensorflow/core/grappler/graph_analyzer/sig_node.h index 6c0731ebb92b54..2caaf605615796 100644 --- a/tensorflow/core/grappler/graph_analyzer/sig_node.h +++ b/tensorflow/core/grappler/graph_analyzer/sig_node.h @@ -42,7 +42,7 @@ class SigNode; // To find nodes by name. Having the map ordered makes the tests easier, // and it isn't used in production code often enough to get any win from // using an unordered map. -using SigNodeMap = std::map>; +using SigNodeMap = std::map>; // One node in the graph, in the form convenient for generation of the signature // of the graph, and comparison of two (sub)graphs for equivalence. It refers to @@ -61,8 +61,8 @@ class SigNode { explicit SigNode(const NodeDef* node); // Access wrappers. - const string& name() const { return node_->name(); } - const string& opcode() const { return node_->op(); } + const std::string& name() const { return node_->name(); } + const std::string& opcode() const { return node_->op(); } const NodeDef* node_def() const { return node_; } // For extraction of subgraphs into a separate SigNodeMap, copies the links @@ -261,7 +261,7 @@ struct Signature { absl::Status Compute(); // Convert the computed signature to a string representation. - string ToString() const; + std::string ToString() const; SigNodeMap map; // The nodes in the graph, accessible by name. size_t sig_short = 0; // Hash of the signature, for the quick equality check. 
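The hunk above reseeds the per-node topology hash with std::hash over std::string instead of the old `string` alias, and the sig_node tests reproduce that seed before folding in link counts. A standalone sketch of that seeding pattern follows; the HashCombine below is a generic stand-in for TensorFlow's CombineHash (whose mixing formula is not part of this patch), and the link counts are made up for the demo.

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>

// Generic boost-style mixer, used here purely for illustration; it mirrors
// the (value, seed*) argument order of the CombineHash calls in the tests.
static void HashCombine(std::size_t value, std::size_t* seed) {
  *seed ^= value + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
}

int main() {
  const std::string opcode = "Mul";
  // Seed the per-node topology hash from the opcode, as ComputeTopoHash0 does.
  std::size_t hval = std::hash<std::string>()(opcode);
  // Fold in illustrative inbound/outbound link counts.
  HashCombine(2, &hval);  // e.g. two inbound links
  HashCombine(1, &hval);  // e.g. one outbound link
  std::cout << "topo hash seed for " << opcode << ": " << hval << "\n";
  return 0;
}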
diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc b/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc index 6f38b4dc8b075c..56980ccedf459c 100644 --- a/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc +++ b/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc @@ -312,7 +312,7 @@ TEST_F(SigNodeTest, ComputeTopoHash0) { EXPECT_THAT(RefNextHashedNodes(&sn1), Eq(0x02)); EXPECT_THAT(RefTopoHash(&sn1), SizeIs(1)); - size_t exp_hval = std::hash()(sn1.opcode()); + size_t exp_hval = std::hash()(sn1.opcode()); CombineHash(1, &exp_hval); CombineHash(1, &exp_hval); CombineHash(2, &exp_hval); @@ -640,14 +640,14 @@ class SignatureTest : public SigBaseTest { std::vector countdown; InitPermutation(graph_size, &plain_permutation, &countdown); - std::set signatures; + std::set signatures; std::vector permutation; do { BuildPermutation(plain_permutation, countdown, &permutation); constexpr bool kDebugPermutation = false; if (kDebugPermutation) { - string p; + std::string p; for (int i = 0; i < permutation.size(); ++i) { p.push_back('0' + permutation[i]); } @@ -1070,7 +1070,7 @@ TEST_F(SignatureTest, OrderLinks) { } // How it was ordered in the original graph. - string before = sig_.ToString(); + std::string before = sig_.ToString(); // clang-format off EXPECT_THAT(before, Eq( "0:Mul[i0:o0:5][i0:o0:4][i0:o1:4][i0:o2:3][i0:o2:2][i0:o3:2]," @@ -1084,7 +1084,7 @@ TEST_F(SignatureTest, OrderLinks) { OrderLinks(&sig_); - string after = sig_.ToString(); + std::string after = sig_.ToString(); // clang-format off EXPECT_THAT(after, Eq( "0:Mul[i0:o0:4][i0:o0:5][i0:o1:4][i0:o2:2][i0:o2:3][i0:o3:2]," @@ -1132,7 +1132,7 @@ TEST_F(SignatureTest, ToString) { RefHashIsFinal(sig_.nodes[i]) = true; } - string result = sig_.ToString(); + std::string result = sig_.ToString(); // clang-format off ASSERT_THAT(result, Eq( @@ -1151,14 +1151,14 @@ TEST_F(SignatureTest, Permutation) { std::vector countdown; InitPermutation(5, &plain_permutation, &countdown); - std::set results; + std::set results; std::vector permutation; do { BuildPermutation(plain_permutation, countdown, &permutation); EXPECT_THAT(permutation, SizeIs(5)); - string p; + std::string p; for (int i = 0; i < permutation.size(); ++i) { p.push_back('0' + permutation[i]); } diff --git a/tensorflow/core/grappler/graph_analyzer/subgraph.cc b/tensorflow/core/grappler/graph_analyzer/subgraph.cc index cfa26f243b20df..c08f23d97468cf 100644 --- a/tensorflow/core/grappler/graph_analyzer/subgraph.cc +++ b/tensorflow/core/grappler/graph_analyzer/subgraph.cc @@ -81,9 +81,9 @@ size_t Subgraph::Identity::Hash() const { return result; } -string Subgraph::Dump() { +std::string Subgraph::Dump() { // TODO(babkin): this is simplified for now. - std::vector nodes; + std::vector nodes; for (const auto& n : id_) { if (specific_) { nodes.emplace_back(absl::StrFormat("%s(%s)", n->opcode(), n->name())); diff --git a/tensorflow/core/grappler/graph_analyzer/subgraph.h b/tensorflow/core/grappler/graph_analyzer/subgraph.h index 7d3494cdc43540..140d7d626d8030 100644 --- a/tensorflow/core/grappler/graph_analyzer/subgraph.h +++ b/tensorflow/core/grappler/graph_analyzer/subgraph.h @@ -64,7 +64,7 @@ class Subgraph { size_t Hash() const { return hash_; } // Dump the subgraph information to a string. 
- string Dump(); + std::string Dump(); // Extract this subgraph into a separate graph representation for signature // building, that includes only the links between the nodes in the subgraph diff --git a/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc b/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc index da29e6cff5d803..2d6849cafbcb57 100644 --- a/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc +++ b/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc @@ -120,15 +120,15 @@ TEST(SubgraphTest, Iteration) { { SubgraphIterator sit(&sg); SubgraphIterator sit2(&sg); - std::vector links; + std::vector links; for (; !sit.AtEnd(); sit.Next()) { EXPECT_TRUE(sit == sit2); sit2.Next(); EXPECT_FALSE(sit == sit2); - links.push_back(absl::StrFormat("[%s,%s,%s]", string(sit.GetPort()), + links.push_back(absl::StrFormat("[%s,%s,%s]", std::string(sit.GetPort()), sit.GetNeighbor().node->name(), - string(sit.GetNeighbor().port))); + std::string(sit.GetNeighbor().port))); } EXPECT_TRUE(sit == sit2); diff --git a/tensorflow/core/grappler/graph_analyzer/test_tools.cc b/tensorflow/core/grappler/graph_analyzer/test_tools.cc index fe24424d81cd1b..f9cc5cda65a40d 100644 --- a/tensorflow/core/grappler/graph_analyzer/test_tools.cc +++ b/tensorflow/core/grappler/graph_analyzer/test_tools.cc @@ -33,15 +33,15 @@ namespace test { //=== Helper methods to construct the nodes. -NodeDef MakeNodeConst(const string& name) { +NodeDef MakeNodeConst(const std::string& name) { NodeDef n; n.set_name(name); n.set_op("Const"); return n; } -NodeDef MakeNode2Arg(const string& name, const string& opcode, - const string& arg1, const string& arg2) { +NodeDef MakeNode2Arg(const std::string& name, const std::string& opcode, + const std::string& arg1, const std::string& arg2) { NodeDef n; n.set_name(name); n.set_op(opcode); @@ -50,9 +50,9 @@ NodeDef MakeNode2Arg(const string& name, const string& opcode, return n; } -NodeDef MakeNode4Arg(const string& name, const string& opcode, - const string& arg1, const string& arg2, const string& arg3, - const string& arg4) { +NodeDef MakeNode4Arg(const std::string& name, const std::string& opcode, + const std::string& arg1, const std::string& arg2, + const std::string& arg3, const std::string& arg4) { NodeDef n; n.set_name(name); n.set_op(opcode); @@ -64,45 +64,47 @@ NodeDef MakeNode4Arg(const string& name, const string& opcode, } // Not really a 2-argument but convenient to construct. -NodeDef MakeNodeShapeN(const string& name, const string& arg1, - const string& arg2) { +NodeDef MakeNodeShapeN(const std::string& name, const std::string& arg1, + const std::string& arg2) { // This opcode is multi-input but not commutative. return MakeNode2Arg(name, "ShapeN", arg1, arg2); } // Not really a 2-argument but convenient to construct. -NodeDef MakeNodeIdentityN(const string& name, const string& arg1, - const string& arg2) { +NodeDef MakeNodeIdentityN(const std::string& name, const std::string& arg1, + const std::string& arg2) { // The argument is of a list type. return MakeNode2Arg(name, "IdentityN", arg1, arg2); } -NodeDef MakeNodeQuantizedConcat(const string& name, const string& arg1, - const string& arg2, const string& arg3, - const string& arg4) { +NodeDef MakeNodeQuantizedConcat(const std::string& name, + const std::string& arg1, + const std::string& arg2, + const std::string& arg3, + const std::string& arg4) { // This opcode has multiple multi-inputs. return MakeNode4Arg(name, "QuantizedConcat", arg1, arg2, arg3, arg4); } //=== Helper methods for analysing the structures. 
-std::vector DumpLinkMap(const GenNode::LinkMap& link_map) { +std::vector DumpLinkMap(const GenNode::LinkMap& link_map) { // This will order the entries first. - std::map ordered; + std::map ordered; for (const auto& link : link_map) { - string key = string(link.first); + std::string key = std::string(link.first); // Order the other sides too. They may be repeating, so store them // in a multiset. - std::multiset others; + std::multiset others; for (const auto& other : link.second) { - others.emplace( - absl::StrFormat("%s[%s]", other.node->name(), string(other.port))); + others.emplace(absl::StrFormat("%s[%s]", other.node->name(), + std::string(other.port))); } ordered[key] = absl::StrJoin(others, ", "); } // Now dump the result in a predictable order. - std::vector result; + std::vector result; result.reserve(ordered.size()); for (const auto& link : ordered) { result.emplace_back(link.first + ": " + link.second); @@ -110,7 +112,8 @@ std::vector DumpLinkMap(const GenNode::LinkMap& link_map) { return result; } -std::vector DumpLinkHashMap(const SigNode::LinkHashMap& link_hash_map) { +std::vector DumpLinkHashMap( + const SigNode::LinkHashMap& link_hash_map) { // The entries in this map are ordered by hash value which might change // at any point. Re-order them by the link tag. std::map tags; @@ -118,23 +121,24 @@ std::vector DumpLinkHashMap(const SigNode::LinkHashMap& link_hash_map) { tags[entry.second.tag] = entry.first; } - std::vector result; + std::vector result; for (const auto& id : tags) { // For predictability, the nodes need to be sorted. - std::vector nodes; + std::vector nodes; for (const auto& peer : link_hash_map.at(id.second).peers) { nodes.emplace_back(peer->name()); } std::sort(nodes.begin(), nodes.end()); - result.emplace_back(string(id.first.local) + ":" + string(id.first.remote) + - ": " + absl::StrJoin(nodes, ", ")); + result.emplace_back(std::string(id.first.local) + ":" + + std::string(id.first.remote) + ": " + + absl::StrJoin(nodes, ", ")); } return result; } -std::vector DumpHashedPeerVector( +std::vector DumpHashedPeerVector( const SigNode::HashedPeerVector& hashed_peers) { - std::vector result; + std::vector result; // Each subset of nodes with the same hash has to be sorted by name. // Other than that, the vector is already ordered by full tags. diff --git a/tensorflow/core/grappler/graph_analyzer/test_tools.h b/tensorflow/core/grappler/graph_analyzer/test_tools.h index e53c8e9b198cb0..89c6f146e0ab01 100644 --- a/tensorflow/core/grappler/graph_analyzer/test_tools.h +++ b/tensorflow/core/grappler/graph_analyzer/test_tools.h @@ -33,47 +33,49 @@ namespace test { //=== Helper methods to construct the nodes. 
-NodeDef MakeNodeConst(const string& name); +NodeDef MakeNodeConst(const std::string& name); -NodeDef MakeNode2Arg(const string& name, const string& opcode, - const string& arg1, const string& arg2); +NodeDef MakeNode2Arg(const std::string& name, const std::string& opcode, + const std::string& arg1, const std::string& arg2); -NodeDef MakeNode4Arg(const string& name, const string& opcode, - const string& arg1, const string& arg2, const string& arg3, - const string& arg4); +NodeDef MakeNode4Arg(const std::string& name, const std::string& opcode, + const std::string& arg1, const std::string& arg2, + const std::string& arg3, const std::string& arg4); -inline NodeDef MakeNodeMul(const string& name, const string& arg1, - const string& arg2) { +inline NodeDef MakeNodeMul(const std::string& name, const std::string& arg1, + const std::string& arg2) { return MakeNode2Arg(name, "Mul", arg1, arg2); } // Not really a 2-argument but convenient to construct. -inline NodeDef MakeNodeAddN(const string& name, const string& arg1, - const string& arg2) { +inline NodeDef MakeNodeAddN(const std::string& name, const std::string& arg1, + const std::string& arg2) { return MakeNode2Arg(name, "AddN", arg1, arg2); } -inline NodeDef MakeNodeSub(const string& name, const string& arg1, - const string& arg2) { +inline NodeDef MakeNodeSub(const std::string& name, const std::string& arg1, + const std::string& arg2) { return MakeNode2Arg(name, "Sub", arg1, arg2); } // Has 2 honest outputs. -inline NodeDef MakeNodeBroadcastGradientArgs(const string& name, - const string& arg1, - const string& arg2) { +inline NodeDef MakeNodeBroadcastGradientArgs(const std::string& name, + const std::string& arg1, + const std::string& arg2) { return MakeNode2Arg(name, "BroadcastGradientArgs", arg1, arg2); } -NodeDef MakeNodeShapeN(const string& name, const string& arg1, - const string& arg2); +NodeDef MakeNodeShapeN(const std::string& name, const std::string& arg1, + const std::string& arg2); -NodeDef MakeNodeIdentityN(const string& name, const string& arg1, - const string& arg2); +NodeDef MakeNodeIdentityN(const std::string& name, const std::string& arg1, + const std::string& arg2); -NodeDef MakeNodeQuantizedConcat(const string& name, const string& arg1, - const string& arg2, const string& arg3, - const string& arg4); +NodeDef MakeNodeQuantizedConcat(const std::string& name, + const std::string& arg1, + const std::string& arg2, + const std::string& arg3, + const std::string& arg4); //=== A container of pre-constructed graphs. @@ -106,12 +108,13 @@ class TestGraphs { //=== Helper methods for analysing the structures. -std::vector DumpLinkMap(const GenNode::LinkMap& link_map); +std::vector DumpLinkMap(const GenNode::LinkMap& link_map); // Also checks for the consistency of hash values. -std::vector DumpLinkHashMap(const SigNode::LinkHashMap& link_hash_map); +std::vector DumpLinkHashMap( + const SigNode::LinkHashMap& link_hash_map); -std::vector DumpHashedPeerVector( +std::vector DumpHashedPeerVector( const SigNode::HashedPeerVector& hashed_peers); } // end namespace test diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.cc b/tensorflow/core/grappler/inputs/file_input_yielder.cc index 5d3e91d8dccee1..87fc1d1f141b2e 100644 --- a/tensorflow/core/grappler/inputs/file_input_yielder.cc +++ b/tensorflow/core/grappler/inputs/file_input_yielder.cc @@ -38,7 +38,7 @@ limitations under the License. 
namespace tensorflow { namespace grappler { -FileInputYielder::FileInputYielder(const std::vector& filenames, +FileInputYielder::FileInputYielder(const std::vector& filenames, size_t max_iterations) : filenames_(filenames), current_file_(0), @@ -64,7 +64,7 @@ bool FileInputYielder::NextItem(GrapplerItem* item) { } } - const string& filename = filenames_[current_file_]; + const std::string& filename = filenames_[current_file_]; ++current_file_; if (!Env::Default()->FileExists(filename).ok()) { @@ -97,12 +97,12 @@ bool FileInputYielder::NextItem(GrapplerItem* item) { metagraph = MetaGraphDef(); return NextItem(item); } else { - std::unordered_set train_ops; - for (const string& val : + std::unordered_set train_ops; + for (const std::string& val : metagraph.collection_def().at("train_op").node_list().value()) { train_ops.insert(NodeName(val)); } - std::unordered_set train_ops_found; + std::unordered_set train_ops_found; for (auto& node : metagraph.graph_def().node()) { if (train_ops.find(node.name()) != train_ops.end()) { train_ops_found.insert(node.name()); @@ -120,7 +120,8 @@ bool FileInputYielder::NextItem(GrapplerItem* item) { } } - const string id = absl::StrCat(Fingerprint64(metagraph.SerializeAsString())); + const std::string id = + absl::StrCat(Fingerprint64(metagraph.SerializeAsString())); ItemConfig cfg; std::unique_ptr new_item = diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.h b/tensorflow/core/grappler/inputs/file_input_yielder.h index f3e9ecb677fdf8..ac1fdb7ac604bc 100644 --- a/tensorflow/core/grappler/inputs/file_input_yielder.h +++ b/tensorflow/core/grappler/inputs/file_input_yielder.h @@ -37,12 +37,12 @@ class FileInputYielder : public InputYielder { // Iterates over the files specified in the list of 'filename' up to // 'max_iterations' times. explicit FileInputYielder( - const std::vector& filenames, + const std::vector& filenames, size_t max_iterations = std::numeric_limits::max()); bool NextItem(GrapplerItem* item) override; private: - const std::vector filenames_; + const std::vector filenames_; size_t current_file_; size_t current_iteration_; size_t max_iterations_; diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc index 7f39582ba663f0..f496d48e28af82 100644 --- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc +++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc @@ -44,7 +44,7 @@ namespace grappler { namespace { GraphDef CreateGraphDef(int num_stages, int width, int tensor_size, bool use_multiple_devices, bool insert_queue, - const std::vector& device_names) { + const std::vector& device_names) { using namespace ::tensorflow::ops; // NOLINT(build/namespaces) tensorflow::Scope s = tensorflow::Scope::NewRootScope(); @@ -102,7 +102,7 @@ GraphDef CreateGraphDef(int num_stages, int width, int tensor_size, TrivialTestGraphInputYielder::TrivialTestGraphInputYielder( int num_stages, int width, int tensor_size, bool insert_queue, - const std::vector& device_names) + const std::vector& device_names) : num_stages_(num_stages), width_(width), tensor_size_(tensor_size), diff --git a/tensorflow/core/grappler/inputs/utils.cc b/tensorflow/core/grappler/inputs/utils.cc index 294bb2cead1111..6c6d3be7a25515 100644 --- a/tensorflow/core/grappler/inputs/utils.cc +++ b/tensorflow/core/grappler/inputs/utils.cc @@ -28,21 +28,22 @@ limitations under the License. 
namespace tensorflow { namespace grappler { -bool FilesExist(const std::vector& files, +bool FilesExist(const std::vector& files, std::vector* status) { return Env::Default()->FilesExist(files, status); } -bool FilesExist(const std::set& files) { - return FilesExist(std::vector(files.begin(), files.end()), nullptr); +bool FilesExist(const std::set& files) { + return FilesExist(std::vector(files.begin(), files.end()), + nullptr); } -bool FileExists(const string& file, absl::Status* status) { +bool FileExists(const std::string& file, absl::Status* status) { *status = Env::Default()->FileExists(file); return status->ok(); } -absl::Status ReadGraphDefFromFile(const string& graph_def_path, +absl::Status ReadGraphDefFromFile(const std::string& graph_def_path, GraphDef* result) { absl::Status status; if (!ReadBinaryProto(Env::Default(), graph_def_path, result).ok()) { @@ -51,7 +52,7 @@ absl::Status ReadGraphDefFromFile(const string& graph_def_path, return status; } -absl::Status ReadMetaGraphDefFromFile(const string& graph_def_path, +absl::Status ReadMetaGraphDefFromFile(const std::string& graph_def_path, MetaGraphDef* result) { absl::Status status; if (!ReadBinaryProto(Env::Default(), graph_def_path, result).ok()) { diff --git a/tensorflow/core/grappler/inputs/utils.h b/tensorflow/core/grappler/inputs/utils.h index 9caefcd836c171..50a35211149f15 100644 --- a/tensorflow/core/grappler/inputs/utils.h +++ b/tensorflow/core/grappler/inputs/utils.h @@ -29,18 +29,18 @@ limitations under the License. namespace tensorflow { namespace grappler { -bool FilesExist(const std::vector& files, +bool FilesExist(const std::vector& files, std::vector* status = nullptr); -bool FilesExist(const std::set& files); +bool FilesExist(const std::set& files); -bool FileExists(const string& file, absl::Status* status); +bool FileExists(const std::string& file, absl::Status* status); // Reads GraphDef from file in either text or raw serialized format. -absl::Status ReadGraphDefFromFile(const string& graph_def_path, +absl::Status ReadGraphDefFromFile(const std::string& graph_def_path, GraphDef* result); // Reads MetaGraphDef from file in either text or raw serialized format. 
-absl::Status ReadMetaGraphDefFromFile(const string& meta_graph_def_path, +absl::Status ReadMetaGraphDefFromFile(const std::string& meta_graph_def_path, MetaGraphDef* result); } // end namespace grappler diff --git a/tensorflow/core/grappler/inputs/utils_test.cc b/tensorflow/core/grappler/inputs/utils_test.cc index b32229a051fa86..ff2e14fc930244 100644 --- a/tensorflow/core/grappler/inputs/utils_test.cc +++ b/tensorflow/core/grappler/inputs/utils_test.cc @@ -34,7 +34,7 @@ namespace { class UtilsTest : public ::testing::Test { protected: - string BaseDir() { return io::JoinPath(testing::TmpDir(), "base_dir"); } + std::string BaseDir() { return io::JoinPath(testing::TmpDir(), "base_dir"); } void SetUp() override { TF_CHECK_OK(env_->CreateDir(BaseDir())); @@ -70,24 +70,24 @@ class UtilsTest : public ::testing::Test { GraphDef graph_def_; MetaGraphDef meta_graph_def_; - string non_existent_file_; - string actual_file_; - string text_graph_def_file_; - string binary_graph_def_file_; - string text_meta_graph_def_file_; - string binary_meta_graph_def_file_; + std::string non_existent_file_; + std::string actual_file_; + std::string text_graph_def_file_; + std::string binary_graph_def_file_; + std::string text_meta_graph_def_file_; + std::string binary_meta_graph_def_file_; Env* env_ = Env::Default(); }; TEST_F(UtilsTest, FilesExist) { - EXPECT_FALSE(FilesExist(std::vector{{non_existent_file_}})); - EXPECT_FALSE( - FilesExist(std::vector{{non_existent_file_}, {actual_file_}})); - EXPECT_TRUE(FilesExist(std::vector{{actual_file_}})); + EXPECT_FALSE(FilesExist(std::vector{{non_existent_file_}})); + EXPECT_FALSE(FilesExist( + std::vector{{non_existent_file_}, {actual_file_}})); + EXPECT_TRUE(FilesExist(std::vector{{actual_file_}})); std::vector status; EXPECT_FALSE(FilesExist( - std::vector{{non_existent_file_}, {actual_file_}}, &status)); + std::vector{{non_existent_file_}, {actual_file_}}, &status)); EXPECT_EQ(status.size(), 2); EXPECT_FALSE(status[0].ok()); EXPECT_TRUE(status[1].ok()); diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc index 60d08603d472c8..b5e7922697e4ad 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc @@ -90,12 +90,12 @@ std::pair GetDeviceGPUArch( } int major, minor; - if (!strings::safe_strto32(split_arch_str[0], &major)) { + if (!absl::SimpleAtoi(split_arch_str[0], &major)) { return {0, 0}; } if (split_arch_str.size() > 1) { - if (strings::safe_strto32(split_arch_str[1], &minor)) { + if (absl::SimpleAtoi(split_arch_str[1], &minor)) { return {major, minor}; } else { return {0, 0}; diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index fe7d4eb4f33f67..1f4943889cc06b 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -72,7 +72,7 @@ Tensor GenerateRandomTensorInRange(const TensorShape& shape, double minval, void VerifyGraphsEquivalent(const GraphDef& original_graph, const GraphDef& optimized_graph, - const string& func) { + const std::string& func) { EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func; GraphView optimized_view(&optimized_graph); for (int i = 0; i < original_graph.node_size(); ++i) { @@ -146,10 +146,10 @@ class AutoMixedPrecisionTest : public GrapplerTest { void TearDown() 
override { TF_CHECK_OK(virtual_cluster_->Shutdown()); } - NodeDef* AddSimpleNode(const string& name, const string& op, - const std::vector& inputs, + NodeDef* AddSimpleNode(const std::string& name, const std::string& op, + const std::vector& inputs, GraphDef* graph) const { - std::vector> attributes; + std::vector> attributes; if (op == "AddN" || op == "ShapeN") { AttrValue num_inputs; num_inputs.set_i(inputs.size()); @@ -203,7 +203,8 @@ class AutoMixedPrecisionTest : public GrapplerTest { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto input_tensor = GenerateRandomTensorInRange( TensorShape({size, size}), input_min, input_max); - std::vector> feed = {{"input", input_tensor}}; + std::vector> feed = { + {"input", input_tensor}}; auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed); AutoMixedPrecision optimizer(mode_); @@ -564,7 +565,7 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveIdentityAfterVariable) { TF_CHECK_OK(s.ToGraphDef(&item.graph)); auto var1_tensor = GenerateConstantTensor(TensorShape({32, 32}), 3.141593f); - std::vector> feed = {{"var1", var1_tensor}}; + std::vector> feed = {{"var1", var1_tensor}}; auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed); AutoMixedPrecision optimizer(mode_); @@ -1035,7 +1036,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListThroughFunction) { // A separate Tensor List cluster is added to test that it is still changed to // DT_HALF. FunctionDefLibrary function_lib; - const Tensor kShape = test::AsTensor({32, 32}); + const Tensor kShape = test::AsTensor({32, 32}); FunctionDef func1 = FunctionDefHelper::Define( "Func1", {"ihandle: variant", "x: float"}, {"ohandle: variant", "y: float"}, {}, @@ -1120,7 +1121,7 @@ int GetCudaVersion(const Cluster& cluster) { const auto& device_env = device_properties.environment(); auto it = device_env.find("cuda"); if (it != device_env.end()) { - string cuda_version_str = it->second; + std::string cuda_version_str = it->second; return std::stoi(cuda_version_str); } } @@ -1407,7 +1408,7 @@ TEST_F(AutoMixedPrecisionCpuTest, MixedFanout) { class AutoMixedPrecisionSimulateGpuTest : public GrapplerTest { protected: void SetUp() override { - std::unordered_map devices; + std::unordered_map devices; DeviceProperties cpu_device; cpu_device.set_type("CPU"); cpu_device.set_frequency(1000); diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc index 857e33bf028c82..8f3603829ffb46 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc @@ -95,11 +95,11 @@ class ConstantFoldingTest : public GrapplerTest { TF_EXPECT_OK(status); EXPECT_EQ(7, output.node_size()); - const string snapshot_or_identity = + const std::string snapshot_or_identity = use_snapshot ? "Snapshot" : "Identity"; for (int i = 0; i < output.node_size(); ++i) { const NodeDef& node = output.node(i); - const string& name = node.name(); + const std::string& name = node.name(); if (name == "mul1") { EXPECT_EQ("Const", node.op()); EXPECT_EQ("^x", node.input(0)); @@ -220,7 +220,7 @@ class ConstantFoldingTest : public GrapplerTest { EXPECT_EQ(2, found); // Check that const folded multiplication node has the expected value. 
- std::vector fetch = {"mul"}; + std::vector fetch = {"mul"}; Tensor value(DT_FLOAT, input_shape); for (int i = 0; i < value.NumElements(); ++i) { value.flat()(i) = i; @@ -309,7 +309,7 @@ TEST_F(ConstantFoldingTest, SimpleFolding) { EXPECT_EQ("d", node_d.name()); EXPECT_EQ("Const", node_d.op()); - std::vector fetch = {"d"}; + std::vector fetch = {"d"}; auto tensors_expected = EvaluateNodes(item.graph, fetch); auto tensors = EvaluateNodes(output, fetch); EXPECT_EQ(1, tensors_expected.size()); @@ -397,7 +397,7 @@ TEST_F(ConstantFoldingTest, AddTree) { auto x_t = GenerateRandomTensor(TensorShape({2, 2})); auto y_t = GenerateRandomTensor(TensorShape({2, 2})); - std::vector fetch = {"add_parent", "mul_parent"}; + std::vector fetch = {"add_parent", "mul_parent"}; auto tensor_expected = EvaluateNodes(item.graph, fetch, {{"x", x_t}, {"y", y_t}}); ASSERT_EQ(fetch.size(), tensor_expected.size()); @@ -453,7 +453,7 @@ TEST_F(ConstantFoldingTest, AddSubtactTree) { // Check that the result nodes have the expected value. auto x_t = GenerateRandomTensor(TensorShape({2, 2})); - std::vector fetch = {"add_parent"}; + std::vector fetch = {"add_parent"}; auto tensor_expected = EvaluateNodes(item.graph, fetch, {{"x", x_t}}); ASSERT_EQ(fetch.size(), tensor_expected.size()); fetch = {"add_parent"}; @@ -478,7 +478,7 @@ TEST_F(ConstantFoldingTest, ConstantPushDown) { ops::Placeholder::Shape(TensorShape({2, 2}))); auto get_op = [&](bool is_commutative, bool is_left_arg_const, - const string& name, const Output& const_arg, + const std::string& name, const Output& const_arg, const Output non_const_arg) -> Output { if (is_add) { if (is_commutative) { @@ -523,7 +523,7 @@ TEST_F(ConstantFoldingTest, ConstantPushDown) { // Check that the result nodes have the expected value. auto x_t = GenerateRandomTensor(TensorShape({2, 2})); - std::vector fetch = {"parent"}; + std::vector fetch = {"parent"}; auto tensor_expected = EvaluateNodes(item.graph, fetch, {{"x", x_t}}); ASSERT_EQ(fetch.size(), tensor_expected.size()); @@ -600,7 +600,7 @@ TEST_F(ConstantFoldingTest, ConstantPushDownBiasAdd) { // Check that the result nodes have the expected value. 
auto x_mat_t = GenerateRandomTensor(TensorShape({2, 2})); auto x_vec_t = GenerateRandomTensor(TensorShape({2})); - std::vector fetch = item.fetch; + std::vector fetch = item.fetch; auto tensor_expected = EvaluateNodes( item.graph, fetch, {{"x_vec", x_vec_t}, {"x_mat", x_mat_t}}); ASSERT_EQ(fetch.size(), tensor_expected.size()); @@ -615,10 +615,9 @@ TEST_F(ConstantFoldingTest, ConstantPushDownBiasAdd) { // This test fails on ROCm platform (see commit message for details) #ifndef TENSORFLOW_USE_ROCM TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_ScalarConst) { - for (string data_format : { - "NHWC", + for (std::string data_format : {"NHWC", #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - "NCHW" + "NCHW" #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }) { MulConvPushDownTest( @@ -636,10 +635,9 @@ TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_ScalarConst) { // This test fails on ROCm platform (see commit message for details) #ifndef TENSORFLOW_USE_ROCM TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_SingletonConst) { - for (string data_format : { - "NHWC", + for (std::string data_format : {"NHWC", #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - "NCHW" + "NCHW" #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }) { for (auto mul_const_input_shape : @@ -658,10 +656,9 @@ TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_SingletonConst) { TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_SingletonConst_ShapeMismatch) { - for (string data_format : { - "NHWC", + for (std::string data_format : {"NHWC", #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - "NCHW" + "NCHW" #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }) { MulConvPushDownTest( @@ -841,18 +838,18 @@ TEST_F(ConstantFoldingTest, NeutralElement) { optimizer.Optimize(/*cluster=*/nullptr, item, &output); TF_EXPECT_OK(status); - const string suffix = + const std::string suffix = (const_type == kConst ? "_const" : (const_type == kLike ? "_like" : "_fill")); - const string zeros_name = strings::StrCat("zeros", suffix); - const string ones_name = strings::StrCat("ones", suffix); - const string ctrl_zeros_name = strings::StrCat("^zeros", suffix); - const string ctrl_ones_name = strings::StrCat("^ones", suffix); + const std::string zeros_name = absl::StrCat("zeros", suffix); + const std::string ones_name = absl::StrCat("ones", suffix); + const std::string ctrl_zeros_name = absl::StrCat("^zeros", suffix); + const std::string ctrl_ones_name = absl::StrCat("^ones", suffix); EXPECT_EQ(const_type == kFill ? 43 : 39, output.node_size()); for (int i = 0; i < output.node_size(); ++i) { const NodeDef& node = output.node(i); - const string& name = node.name(); + const std::string& name = node.name(); if (name == "mul1") { EXPECT_EQ("Const", node.op()); EXPECT_EQ("^x", node.input(0)); @@ -968,8 +965,8 @@ TEST_F(ConstantFoldingTest, NeutralElement) { EXPECT_EQ("y", node.input(0)); EXPECT_EQ(ctrl_zeros_name, node.input(1)); } - const std::set square_zero_const{"mul1", "mul2", "mul5", - "mul6", "matmul1", "matmul2"}; + const std::set square_zero_const{ + "mul1", "mul2", "mul5", "mul6", "matmul1", "matmul2"}; if (square_zero_const.count(name) > 0) { TensorProto t = node.attr().at("value").tensor(); EXPECT_EQ(1, t.float_val_size()); @@ -1029,7 +1026,7 @@ TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) { EXPECT_EQ(8, output.node_size()); for (int i = 0; i < output.node_size(); ++i) { const NodeDef& node = output.node(i); - const string& name = node.name(); + const std::string& name = node.name(); if (name == "div_i") { // Integer division is unchanged. 
EXPECT_EQ("Div", node.op()); @@ -1061,7 +1058,7 @@ TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) { } // Check that the reciprocals have the expected value. - std::vector fetch = {"cf_half"}; + std::vector fetch = {"cf_half"}; auto tensor_expected = EvaluateNodes(item.graph, fetch); EXPECT_EQ(fetch.size(), tensor_expected.size()); fetch = {"ConstantFolding/div_f_recip", "ConstantFolding/realdiv_recip"}; @@ -1090,13 +1087,13 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) { // Multiplies without any additional ops to supply the output shape. int count = 0; std::vector muls; - std::unordered_set not_converted; - std::unordered_set to_const; - std::unordered_set to_identity; + std::unordered_set not_converted; + std::unordered_set to_const; + std::unordered_set to_identity; for (const auto* x : {&x_known, &x_partially_known, &x_unknown}) { for (const auto* zeros : {&zeros_known, &zeros_partially_known, &zeros_unknown}) { - const string name = strings::StrCat("mul_", count++); + const std::string name = absl::StrCat("mul_", count++); muls.push_back(ops::Mul(s.WithOpName(name), *x, *zeros)); if (x == &x_partially_known && zeros == &zeros_partially_known) { to_identity.insert(name); @@ -1120,7 +1117,7 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) { EXPECT_EQ(15, output.node_size()); for (int i = 0; i < output.node_size(); ++i) { const NodeDef& node = output.node(i); - const string& name = node.name(); + const std::string& name = node.name(); if (to_const.count(name) > 0) { EXPECT_EQ("Const", node.op()) << node.name(); } else if (to_identity.count(name) > 0) { @@ -1130,7 +1127,7 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) { } } - const std::vector fetch = {"mul_0", "mul_4", "mul_8"}; + const std::vector fetch = {"mul_0", "mul_4", "mul_8"}; auto x_known_t = GenerateRandomTensor(TensorShape({2, 2})); auto x_partially_unknown_t = GenerateRandomTensor(TensorShape({3, 4})); @@ -1166,11 +1163,11 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) { // will propagate the shape back to the inputs of AddN, making the // output shapes of all its inputs known std::vector muls_deduced_output_shape; - std::unordered_set to_const; + std::unordered_set to_const; int count = 0; for (const auto& x : {x_partially_known, x_unknown}) { for (const auto& zeros : {zeros_partially_known, zeros_unknown}) { - const string name = strings::StrCat("mul_", count++); + const std::string name = absl::StrCat("mul_", count++); muls_deduced_output_shape.push_back( ops::Mul(s.WithOpName(name), x, zeros)); to_const.insert(name); @@ -1193,7 +1190,7 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) { EXPECT_EQ(10, output.node_size()); for (int i = 0; i < output.node_size(); ++i) { const NodeDef& node = output.node(i); - const string& name = node.name(); + const std::string& name = node.name(); if (to_const.count(name) > 0) { EXPECT_EQ("Const", node.op()) << node.name(); EXPECT_EQ(2, node.input_size()); @@ -1201,7 +1198,7 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) { EXPECT_TRUE(IsControlInput(node.input(1))); } } - const std::vector fetch = {"addn1"}; + const std::vector fetch = {"addn1"}; auto x_partially_unknown_t = GenerateRandomTensor(TensorShape({2, 2})); auto x_unknown_t = GenerateRandomTensor(TensorShape({2, 2})); @@ -1230,10 +1227,10 @@ TEST_F(ConstantFoldingTest, CreateConstNodes) { MAKE_TEST_GRAPH(float); MAKE_TEST_GRAPH(double); 
MAKE_TEST_GRAPH(int64_t); - MAKE_TEST_GRAPH(int32); - MAKE_TEST_GRAPH(int16); - MAKE_TEST_GRAPH(int8); - MAKE_TEST_GRAPH(uint8); + MAKE_TEST_GRAPH(int32_t); + MAKE_TEST_GRAPH(int16_t); + MAKE_TEST_GRAPH(int8_t); + MAKE_TEST_GRAPH(uint8_t); #undef MAKE_TEST_GRAPH Output bool_const = ops::Const(s.WithOpName("bool_const"), true, {5}); @@ -1307,7 +1304,7 @@ TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) { EXPECT_EQ("f", new_d.name()); EXPECT_EQ("Const", new_d.op()); - std::vector fetch = {"e", "f"}; + std::vector fetch = {"e", "f"}; auto tensors_expected = EvaluateNodes(item.graph, fetch); auto tensors = EvaluateNodes(output, fetch); EXPECT_EQ(fetch.size(), tensors_expected.size()); @@ -1338,7 +1335,7 @@ TEST_F(ConstantFoldingTest, ControlDependencies) { absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output); TF_EXPECT_OK(status); - std::vector expected_nodes = {"dflt", "p1", "p2", "i3"}; + std::vector expected_nodes = {"dflt", "p1", "p2", "i3"}; EXPECT_EQ(output.node_size(), expected_nodes.size()); int i = 0; int found = 0; @@ -1381,8 +1378,8 @@ TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) { absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output); TF_EXPECT_OK(status); - std::vector expected_nodes = {"dflt", "p1", "p2", "c", - "i1", "i2", "e"}; + std::vector expected_nodes = {"dflt", "p1", "p2", "c", + "i1", "i2", "e"}; EXPECT_EQ(output.node_size(), expected_nodes.size()); int i = 0; int found = 0; @@ -1439,7 +1436,7 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) { absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output); TF_EXPECT_OK(status); - std::vector expected_nodes = {"dflt", "p1", "p2", "i2"}; + std::vector expected_nodes = {"dflt", "p1", "p2", "i2"}; EXPECT_EQ(output.node_size(), expected_nodes.size()); int i = 0; for (const auto& node : output.node()) { @@ -1466,9 +1463,9 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) { ops::DynamicPartition part(scope.WithOpName("partition"), input, indices, num_partitions); - std::vector outputs; + std::vector outputs; for (int i = 0; i < num_partitions; ++i) { - string part_out_name = strings::StrCat("part_out", i); + std::string part_out_name = absl::StrCat("part_out", i); ops::Identity partition_out(scope.WithOpName(part_out_name), {part.outputs[i]}); outputs.push_back(part_out_name); @@ -1481,7 +1478,7 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) { Tensor initial_val(DT_INT32, TensorShape({3})); test::FillIota(&initial_val, 7); for (int i = 1; i < 5; ++i) { - TF_CHECK_OK(NodeDefBuilder(strings::StrCat("in", i), "Const") + TF_CHECK_OK(NodeDefBuilder(absl::StrCat("in", i), "Const") .Attr("dtype", DT_INT32) .Attr("value", initial_val) .Finalize(item.graph.add_node())); @@ -1502,7 +1499,7 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) { .Finalize(item.graph.add_node())); for (int i = 0; i < 4; ++i) { - string concat_offset_out_name = strings::StrCat("concat_offset_out", i); + std::string concat_offset_out_name = absl::StrCat("concat_offset_out", i); TF_CHECK_OK(NodeDefBuilder(concat_offset_out_name, "Identity") .Attr("T", DT_INT32) .Input("concat_offsets", i, DT_INT32) @@ -1518,8 +1515,8 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) { int constant_folded = 0; for (const auto& node : output.node()) { - if (node.name().find("part_out") != string::npos || - node.name().find("concat_offset_out") != string::npos) { + if (node.name().find("part_out") != std::string::npos || + node.name().find("concat_offset_out") != 
std::string::npos) { ++constant_folded; EXPECT_EQ("Const", node.op()); } @@ -1638,7 +1635,7 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) { auto v1_t = GenerateRandomTensor(TensorShape({3})); auto v2_t = GenerateRandomTensor(TensorShape({5, 7})); auto v3_t = GenerateRandomTensor(TensorShape({11, 13})); - std::vector fetch_nodes = {"p2"}; + std::vector fetch_nodes = {"p2"}; auto tensors_expected = EvaluateNodes( item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}}); EXPECT_EQ(1, tensors_expected.size()); @@ -1711,8 +1708,8 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) { auto v1_t = GenerateRandomTensor(TensorShape({3, 4})); auto v2_t = GenerateRandomTensor(TensorShape({5, 6})); auto v3_t = GenerateRandomTensor(TensorShape({4, 6})); - const std::vector fetch_nodes = {"i1a", "i1b", "i2a", "i2b", - "i2c", "i3a", "i3b"}; + const std::vector fetch_nodes = {"i1a", "i1b", "i2a", "i2b", + "i2c", "i3a", "i3b"}; auto tensors_expected = EvaluateNodes( item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}}); EXPECT_EQ(fetch_nodes.size(), tensors_expected.size()); @@ -1814,15 +1811,16 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) { absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output); TF_EXPECT_OK(status); - std::set present_nodes = {"v_in", "v_ctrl", - "switch", "i", - "p1", "p2", - "m", "false", - "constant", "switch2", - "i2", "i3", - "m2", "ConstantFoldingCtrl/switch_0", - "rank", "size"}; - std::set not_present_nodes = {"ConstantFolding/switch2-0"}; + std::set present_nodes = { + "v_in", "v_ctrl", + "switch", "i", + "p1", "p2", + "m", "false", + "constant", "switch2", + "i2", "i3", + "m2", "ConstantFoldingCtrl/switch_0", + "rank", "size"}; + std::set not_present_nodes = {"ConstantFolding/switch2-0"}; EXPECT_EQ(present_nodes.size(), output.node_size()); int found = 0; for (const auto& node : output.node()) { @@ -1862,7 +1860,7 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) { Tensor v_ctrl_t(DT_BOOL, TensorShape({})); v_ctrl_t.flat()(0) = true; - std::vector fetch_nodes = {"m", "m2"}; + std::vector fetch_nodes = {"m", "m2"}; auto tensors_expected = EvaluateNodes( item.graph, fetch_nodes, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}}); EXPECT_EQ(2, tensors_expected.size()); @@ -1915,15 +1913,16 @@ TEST_F(ConstantFoldingTest, SwitchNodes) { GraphDef output; absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output); TF_EXPECT_OK(status); - std::set present_nodes = {"v_in", "v_ctrl", - "switch", "i", - "p1", "p2", - "m", "false", - "constant", "switch2", - "i2", "i3", - "m2", "ConstantFoldingCtrl/switch_0"}; - std::set not_present_nodes = {"rank", "size", - "ConstantFolding/switch2-0"}; + std::set present_nodes = { + "v_in", "v_ctrl", + "switch", "i", + "p1", "p2", + "m", "false", + "constant", "switch2", + "i2", "i3", + "m2", "ConstantFoldingCtrl/switch_0"}; + std::set not_present_nodes = {"rank", "size", + "ConstantFolding/switch2-0"}; EXPECT_EQ(present_nodes.size(), output.node_size()); int found = 0; @@ -2584,7 +2583,7 @@ TEST_F(ConstantFoldingTest, MergeConcat_PartialFolding) { } TEST_F(ConstantFoldingTest, PaddingWithZeroSize) { - PaddingWithZeroSize(); + PaddingWithZeroSize(); PaddingWithZeroSize(); } @@ -2770,7 +2769,7 @@ TEST_F(ConstantFoldingTest, SingleElementEmptyAxisReduction) { GenerateRandomTensor(TensorShape({1, 1, 1})); auto input_var_one_dim_t = GenerateRandomTensor(TensorShape({1})); Tensor input_var_axis_t(DT_INT32, TensorShape({1})); - input_var_axis_t.flat()(0) = 0; + 
input_var_axis_t.flat()(0) = 0; auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"input_var_three_dim", input_var_three_dim_t}, @@ -2895,7 +2894,7 @@ TEST_F(ConstantFoldingTest, Packing) { absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output); TF_EXPECT_OK(status); - const std::vector fetch_nodes = {"i1", "i2"}; + const std::vector fetch_nodes = {"i1", "i2"}; auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes); EXPECT_EQ(fetch_nodes.size(), tensors_expected.size()); auto tensors = EvaluateNodes(output, fetch_nodes); @@ -2971,7 +2970,7 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) { absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output); TF_EXPECT_OK(status); - std::vector fetch_nodes = {"o1", "o2", "p1", "p2"}; + std::vector fetch_nodes = {"o1", "o2", "p1", "p2"}; auto a_t = GenerateRandomTensor(TensorShape({1, 5})); auto g_t = GenerateRandomTensor(TensorShape({1})); auto tensors_expected = @@ -3042,7 +3041,7 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) { GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); - std::vector fetch_nodes = {"o1", "o2"}; + std::vector fetch_nodes = {"o1", "o2"}; auto a_t = GenerateRandomTensor(TensorShape({2, 2})); auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}}); EXPECT_EQ(fetch_nodes.size(), tensors_expected.size()); @@ -3331,7 +3330,7 @@ TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) { }; for (bool use_add_n : {true, false}) { auto fun = use_add_n ? addn_fun : accumulate_fun; - const string op_name = use_add_n ? "AddN" : "AccumulateNV2"; + const std::string op_name = use_add_n ? "AddN" : "AccumulateNV2"; Scope s = Scope::NewRootScope(); Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT, ops::Placeholder::Shape(TensorShape({2, 2}))); @@ -3411,7 +3410,7 @@ TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) { } } - std::vector fetch = {"acc0"}; + std::vector fetch = {"acc0"}; auto tensors_expected = EvaluateNodes(item.graph, fetch); auto tensors = EvaluateNodes(output, fetch); EXPECT_EQ(1, tensors_expected.size()); @@ -3613,7 +3612,7 @@ TEST_F(ConstantFoldingTest, TrivialPack) { } EXPECT_EQ(found, 3); - std::vector fetch = {"stack", "stack_no_axis"}; + std::vector fetch = {"stack", "stack_no_axis"}; auto tensors_expected = EvaluateNodes(item.graph, fetch); auto tensors = EvaluateNodes(output, fetch); EXPECT_EQ(2, tensors_expected.size()); @@ -3741,8 +3740,8 @@ TEST_F(ConstantFoldingTest, TensorArraySize) { auto tensors_actual = EvaluateNodes(output, {"dynamic_sz", "static_sz"}); EXPECT_EQ(2, tensors_expected.size()); EXPECT_EQ(2, tensors_actual.size()); - test::ExpectTensorEqual(tensors_expected[0], tensors_actual[0]); - test::ExpectTensorEqual(tensors_expected[1], tensors_actual[1]); + test::ExpectTensorEqual(tensors_expected[0], tensors_actual[0]); + test::ExpectTensorEqual(tensors_expected[1], tensors_actual[1]); } TEST_F(ConstantFoldingTest, FoldingPreservesDenormalFlushing) { @@ -3770,7 +3769,7 @@ TEST_F(ConstantFoldingTest, FoldingPreservesDenormalFlushing) { EXPECT_EQ("c", node_d.name()); EXPECT_EQ("Const", node_d.op()); - std::vector fetch = {"c"}; + std::vector fetch = {"c"}; auto tensors_expected = EvaluateNodes(item.graph, fetch); auto tensors = EvaluateNodes(output, fetch); EXPECT_EQ(1, tensors_expected.size()); @@ -3800,7 +3799,7 @@ TEST_F(ConstantFoldingTest, EvaluatingLargeConstantNoFoldingMergingLoop) { absl::Status status = 
optimizer.Optimize(/*cluster=*/nullptr, item, &output); TF_EXPECT_OK(status); - std::vector fetch = {"result"}; + std::vector fetch = {"result"}; auto tensors_expected = EvaluateNodes(item.graph, fetch); auto tensors = EvaluateNodes(output, fetch); EXPECT_EQ(1, tensors_expected.size()); @@ -3869,9 +3868,9 @@ class ConstantFoldingCastConstTest : public GrapplerTest { return output; } - void EvaluateAndCompareUnoptimized(const GraphDef& unoptimized_graph, - const GraphDef& optimized_graph, - const std::vector& fetch_nodes) { + void EvaluateAndCompareUnoptimized( + const GraphDef& unoptimized_graph, const GraphDef& optimized_graph, + const std::vector& fetch_nodes) { auto tensors_expected = EvaluateNodes(unoptimized_graph, fetch_nodes); auto tensors = EvaluateNodes(optimized_graph, fetch_nodes); ASSERT_EQ(fetch_nodes.size(), tensors_expected.size()); @@ -4093,8 +4092,8 @@ TEST_F(ConstantFoldingTest, SimplifyCase) { TensorShapeProto* g_shape = output_shapes.mutable_list()->add_shape(); g_shape->set_unknown_rank(true); - const Tensor kZero = test::AsScalar(0); - const Tensor kOne = test::AsScalar(1); + const Tensor kZero = test::AsScalar(0); + const Tensor kOne = test::AsScalar(1); item.graph = test::function::GDef( {NDef("one", "Const", {}, {{"value", index == 0 ? kZero : kOne}, {"dtype", DT_INT32}}, @@ -4265,8 +4264,8 @@ TEST_F(ConstantFoldingTest, SimplifySelect_BroadcastTo) { ASSERT_EQ(node.input_size(), 4); EXPECT_EQ(node.input(0), pred_val ? "then" : "else"); EXPECT_EQ(node.input(1), - strings::StrCat("ConstantFolding/select-broadcastto_shape-", - pred_val ? 1 : 2)); + absl::StrCat("ConstantFolding/select-broadcastto_shape-", + pred_val ? 1 : 2)); EXPECT_EQ(node.input(2), pred_val ? "^else" : "^if"); EXPECT_EQ(node.input(3), pred_val ? "^if" : "^then"); } diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc index 2854810e3c040f..aef15c4fdf1b2e 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc @@ -252,7 +252,7 @@ absl::Status TransposeContext::InitializeTransposeContext( TF_RETURN_IF_ERROR(status); context->num_nodes = context->graph.node_size(); const auto& nodes_to_preserve = item.NodesToPreserve(); - context->nodes_to_preserve = absl::flat_hash_set( + context->nodes_to_preserve = absl::flat_hash_set( nodes_to_preserve.begin(), nodes_to_preserve.end()); TF_RETURN_IF_ERROR(context->frames.InferFromGraph(context->graph)); return absl::OkStatus(); @@ -262,9 +262,9 @@ absl::Status TransposeContext::InitializeTransposeContext( void TransposeContext::AssignDeviceAndDataFormats( absl::string_view target_device, absl::string_view src_format, absl::string_view dst_format) { - this->target_device = string(target_device); - this->src_format = string(src_format); - this->dst_format = string(dst_format); + this->target_device = std::string(target_device); + this->src_format = std::string(src_format); + this->dst_format = std::string(dst_format); this->src_dim_indices = GetDimensionIndices(src_format); this->dst_dim_indices = GetDimensionIndices(dst_format); this->src_to_dst = GetPermutation(this->src_dim_indices, dst_format); @@ -276,9 +276,9 @@ void TransposeContext::AssignDeviceAndDataFormats( bool Transposer::ShouldProcess(const TransposeContext& context, const utils::MutableNodeView& node) const { const auto* node_def = node.node(); - const string& device_name = 
GetDeviceName(*node_def); - string device; - string task; + const std::string& device_name = GetDeviceName(*node_def); + std::string device; + std::string task; const bool is_on_target_device = DeviceNameUtils::SplitDeviceName(device_name, &task, &device) && absl::StrContains(absl::AsciiStrToLower(device), @@ -306,12 +306,12 @@ absl::Status Transposer::CreateConstPermNode( DCHECK(!graph_view->HasNode(node_name)); NodeDef node; - node.set_name(string(node_name)); + node.set_name(node_name); node.set_op(kOpConst); - node.set_device(string(device)); + node.set_device(device); if (!control_node_name.empty()) { - node.add_input(string(control_node_name)); + node.add_input(std::string(control_node_name)); } AttrValue attr_data_type; @@ -337,8 +337,8 @@ absl::Status Transposer::CreateTransposeNode( const DataType& data_type, absl::string_view device, TensorShapeProto fanin_shape, absl::Span permutation, absl::string_view control_node_name, utils::MutationNewNode* added_node, - string* transpose_node_name) { - const string node_name = absl::Substitute(name_format, kOpTranspose); + std::string* transpose_node_name) { + const std::string node_name = absl::Substitute(name_format, kOpTranspose); auto* graph_view = context->graph_view.get(); DCHECK(!graph_view->HasNode(node_name)); *transpose_node_name = node_name; @@ -346,7 +346,7 @@ absl::Status Transposer::CreateTransposeNode( NodeDef node; node.set_name(node_name); node.set_op(kOpTranspose); - node.set_device(string(device)); + node.set_device(device); AttrValue attr_data_type; attr_data_type.set_type(data_type); @@ -367,7 +367,7 @@ absl::Status Transposer::CreateTransposeNode( // Create Const Node utils::MutationNewNode const_perm_added_node; - const string const_perm_node_name = + const std::string const_perm_node_name = absl::Substitute(name_format, "PermConst"); TF_RETURN_IF_ERROR(CreateConstPermNode(context, const_perm_node_name, device, permutation, control_node_name, @@ -457,11 +457,11 @@ absl::Status Transposer::CreateDataFormatNode( // Create the node NodeDef node; - node.set_name(string(node_name)); + node.set_name(node_name); // Set up parameters of node. - node.set_op(string(op)); - node.set_device(string(device)); + node.set_op(op); + node.set_device(device); AttrValue attr_data_type; attr_data_type.set_type(data_type); node.mutable_attr()->insert({"T", attr_data_type}); @@ -503,7 +503,7 @@ absl::Status Transposer::UpdateEdge( auto* dst_node_def = dst_node->node(); // TODO(lyandy): Minimize device parsing/fetching. - const string device = GetDeviceName( + const std::string device = GetDeviceName( is_src_format_to_dst_format ? *dst_node_def : *src_node_def); DataType data_type = is_src_format_to_dst_format @@ -515,7 +515,7 @@ absl::Status Transposer::UpdateEdge( .dtype(); utils::MutationNewNode added_node; - string added_node_name; + std::string added_node_name; if (op == kOpTranspose) { TensorShapeProto input_shape_proto; input_shape_proto.set_unknown_rank(true); @@ -527,7 +527,7 @@ absl::Status Transposer::UpdateEdge( input_shape_proto = src_node_shape_attr->list().shape(src_port); } } - const string control_node_name = + const std::string control_node_name = is_in_frame ? AsControlDependency(src_node_def->name()) : ""; const std::vector& permutation = is_src_format_to_dst_format ? 
context->src_to_dst : context->dst_to_src; @@ -540,7 +540,7 @@ absl::Status Transposer::UpdateEdge( GetDeviceName(*src_node_def), &parsed_name) && parsed_name.type != "CPU" && IsHostMemory(*src_node_def, src_port); - const string node_name = absl::Substitute(name_format, op); + const std::string node_name = absl::Substitute(name_format, op); TF_RETURN_IF_ERROR(CreateDataFormatNode( context, node_name, op, device, data_type, is_fanin_on_host, is_src_format_to_dst_format, &added_node)); @@ -655,40 +655,42 @@ bool Transposer::CanProcessNode(const TransposeContext& context, !(node.NumRegularFanouts() == 0 && node.NumControlledFanouts() == 0); } -string Transposer::GetFaninNameFormat(absl::string_view node_name, int port, - absl::string_view src_format, - absl::string_view dst_format) { +std::string Transposer::GetFaninNameFormat(absl::string_view node_name, + int port, + absl::string_view src_format, + absl::string_view dst_format) { return absl::StrCat(node_name, "-", port, "-$0", src_format, "To", dst_format, "-", kOptimizedSuffix); } -string Transposer::GetFanoutNameFormat(absl::string_view node_name, int port, - int index, absl::string_view src_format, - absl::string_view dst_format) { +std::string Transposer::GetFanoutNameFormat(absl::string_view node_name, + int port, int index, + absl::string_view src_format, + absl::string_view dst_format) { return absl::StrCat(node_name, "-", port, "-", index, "-$0", dst_format, "To", src_format, "-", kOptimizedSuffix); } -string Transposer::LayoutOptimizerNode(absl::string_view node_name) { +std::string Transposer::LayoutOptimizerNode(absl::string_view node_name) { return absl::StrCat(node_name, "-", kOptimizedSuffix); } -string Transposer::GetReshapeNodeNameFormat(absl::string_view node_name, - int index, - absl::string_view src_format, - absl::string_view dst_format) { +std::string Transposer::GetReshapeNodeNameFormat(absl::string_view node_name, + int index, + absl::string_view src_format, + absl::string_view dst_format) { return absl::StrCat(node_name, "-", index, "-", kReshape, src_format, "To", dst_format); } -string Transposer::GetShapeConstNodeNameFormat(absl::string_view node_name, - int index) { +std::string Transposer::GetShapeConstNodeNameFormat(absl::string_view node_name, + int index) { return absl::StrCat(node_name, "-", index, "-", kReshapeConst); } // Layout sensitive transposer. 
-inline string GetLayoutSensitiveNodeDataFormat( +inline std::string GetLayoutSensitiveNodeDataFormat( const utils::MutableNodeView& node) { const auto* attr = node.GetAttr(kAttrDataFormat); if (attr != nullptr) { @@ -1086,7 +1088,7 @@ inline bool IsValidConstPermTransposeNode(const utils::MutableNodeView& node, return false; } - const auto& tensor_data = tensor.unaligned_flat(); + const auto& tensor_data = tensor.unaligned_flat(); for (int i = 0; i < permutation_size; i++) { if (permutation[i] != tensor_data(i)) { return false; @@ -1252,11 +1254,11 @@ absl::Status BinaryOpTransposer::AddNodeReshape( absl::string_view node_device, absl::string_view input_name, absl::string_view shape_const_node_name, const DataType& data_type) { NodeDef new_node; - new_node.set_name(string(node_name)); - new_node.add_input(string(input_name)); - new_node.add_input(string(shape_const_node_name)); + new_node.set_name(node_name); + new_node.add_input(std::string(input_name)); + new_node.add_input(std::string(shape_const_node_name)); new_node.set_op(kReshape); - new_node.set_device(string(node_device)); + new_node.set_device(node_device); AttrValue attr_type_indices; attr_type_indices.set_type(DT_INT32); @@ -1276,9 +1278,9 @@ absl::Status BinaryOpTransposer::AddNodeShapeConst( absl::string_view node_device, bool node_in_frame, int num_channels, absl::string_view depended_node, int rank) { NodeDef new_node; - new_node.set_name(string(node_name)); + new_node.set_name(node_name); new_node.set_op(kOpConst); - new_node.set_device(string(node_device)); + new_node.set_device(node_device); AttrValue attr_data_type; attr_data_type.set_type(DT_INT32); new_node.mutable_attr()->insert({"dtype", attr_data_type}); @@ -1296,7 +1298,7 @@ absl::Status BinaryOpTransposer::AddNodeShapeConst( // This is to ensure the transpose node and the const node are in the same // frame. // TODO(halehri): Add Test that exercises this condition. - new_node.add_input(AsControlDependency(string(depended_node))); + new_node.add_input(AsControlDependency(std::string(depended_node))); } absl::Status status; @@ -1313,11 +1315,12 @@ absl::Status BinaryOpTransposer::MaybeReshapeVectorFanin( vector_index = 0; } if (vector_index != -1) { - const string& node_name = node->GetName(); - const string& node_device = node->GetDevice(); - string reshape_node_name = LayoutOptimizerNode(GetReshapeNodeNameFormat( - node_name, vector_index, context->src_format, context->dst_format)); - string shape_const_node_name = LayoutOptimizerNode( + const std::string& node_name = node->GetName(); + const std::string& node_device = node->GetDevice(); + std::string reshape_node_name = + LayoutOptimizerNode(GetReshapeNodeNameFormat( + node_name, vector_index, context->src_format, context->dst_format)); + std::string shape_const_node_name = LayoutOptimizerNode( GetShapeConstNodeNameFormat(node_name, vector_index)); const auto& fanin = node->GetRegularFanin(vector_index); auto* fanin_node = fanin.node_view(); @@ -1513,7 +1516,7 @@ bool ReduceTransposer::IsAlongAxis(const Tensor& tensor, for (int i = 0; i < axis_size; ++i) { int local_axis = 0; if (tensor.dtype() == DT_INT32) { - local_axis = tensor.flat()(i); + local_axis = tensor.flat()(i); } else { local_axis = tensor.flat()(i); } @@ -2023,10 +2026,10 @@ absl::Status UnaryGradTransposer::TransposeNode(TransposeContext* context, // Utils. 
-string GetDeviceName(const NodeDef& node) { return node.device(); } +std::string GetDeviceName(const NodeDef& node) { return node.device(); } bool IsDefaultLayoutSensitiveOp(const NodeDef& node) { - static absl::flat_hash_set* default_layout_sensitive_ops = + static absl::flat_hash_set* default_layout_sensitive_ops = new absl::flat_hash_set( {"AvgPool", "Conv2D", "DepthwiseConv2dNative", "DepthToSpace", "FusedBatchNorm", "FusedBatchNormV2", "FusedBatchNormV3", @@ -2049,7 +2052,7 @@ bool IsLayoutSensitiveOp(const NodeDef& node) { } bool IsDefaultLayoutAgnosticOp(const NodeDef& node) { - static absl::flat_hash_set* agnostic_nodes = + static absl::flat_hash_set* agnostic_nodes = new absl::flat_hash_set({"Abs", "Acos", "Acosh", @@ -2253,7 +2256,7 @@ bool GetValueAttrFromConstInputNode( } bool IsDataFormatOp(const utils::MutableNodeView& node) { - const string& op = node.GetOp(); + const std::string& op = node.GetOp(); return op == kOpDataFormatDimMap || op == kOpDataFormatVecPermute; } diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc index b46afca62dc4a3..46ae83f1f04a3b 100644 --- a/tensorflow/core/grappler/optimizers/static_schedule_test.cc +++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc @@ -40,7 +40,7 @@ class StaticScheduleTest : public ::testing::Test { cpu_device.set_l1_cache_size(32 * 1024); cpu_device.set_l2_cache_size(256 * 1024); cpu_device.set_l3_cache_size(4 * 1024 * 1024); - std::unordered_map devices; + std::unordered_map devices; devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device; return std::unique_ptr(new VirtualCluster(devices)); } diff --git a/tensorflow/core/grappler/utils/scc_test.cc b/tensorflow/core/grappler/utils/scc_test.cc index 4fc4e7abaa4339..d4c196167fca43 100644 --- a/tensorflow/core/grappler/utils/scc_test.cc +++ b/tensorflow/core/grappler/utils/scc_test.cc @@ -31,7 +31,7 @@ namespace { class SCCTest : public ::testing::Test { public: void SetUp() override { - std::unordered_map devices; + std::unordered_map devices; DeviceProperties unknown_device; devices["MY_DEVICE"] = unknown_device; cluster_ = std::make_unique(devices); @@ -41,11 +41,11 @@ class SCCTest : public ::testing::Test { void TearDown() override { cluster_.reset(); } protected: - static NodeDef CreateNode(const string& name, - absl::Span inputs) { + static NodeDef CreateNode(const std::string& name, + absl::Span inputs) { NodeDef node; node.set_name(name); - for (const string& input : inputs) { + for (const std::string& input : inputs) { node.add_input(input); } return node; @@ -86,7 +86,7 @@ TEST_F(SCCTest, DisjointCycleAndPath) { *graph.add_node() = CreateNode("h", {"g"}); std::vector nodes; - std::unordered_map name_to_node; + std::unordered_map name_to_node; for (const auto& n : graph.node()) { nodes.push_back(&n); name_to_node[n.name()] = &n; @@ -149,7 +149,7 @@ TEST_F(SCCTest, WikipediaExample) { *graph.add_node() = CreateNode("h", {"h"}); std::vector nodes; - std::unordered_map name_to_node; + std::unordered_map name_to_node; for (const auto& n : graph.node()) { nodes.push_back(&n); name_to_node[n.name()] = &n; @@ -187,7 +187,7 @@ TEST_F(SCCTest, TensorFlowLoop) { with open('/tmp/graph.txt', 'w') as f: f.write(str(tf.get_default_graph().as_graph_def())) */ - const string gdef_ascii = R"EOF( + const std::string gdef_ascii = R"EOF( node { name: "Const" op: "Const" @@ -411,7 +411,7 @@ versions { TEST_F(SCCTest, NestedLoops) { GrapplerItem item; - string filename = io::JoinPath( + 
std::string filename = io::JoinPath( testing::TensorFlowSrcRoot(), "core/grappler/costs/graph_properties_testdata/nested_loop.pbtxt"); TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph)); diff --git a/tensorflow/core/grappler/verifiers/graph_verifier.h b/tensorflow/core/grappler/verifiers/graph_verifier.h index 53d62e4c986d68..0e59d4ed3a28c7 100644 --- a/tensorflow/core/grappler/verifiers/graph_verifier.h +++ b/tensorflow/core/grappler/verifiers/graph_verifier.h @@ -41,7 +41,7 @@ class GraphVerifier { virtual ~GraphVerifier() {} // A name for the verifier. - virtual string name() const = 0; + virtual std::string name() const = 0; // Implement an algorithm to verify the specified graph. // The return value is a Status that represents a concatenation of Status of diff --git a/tensorflow/core/grappler/verifiers/structure_verifier.h b/tensorflow/core/grappler/verifiers/structure_verifier.h index de77933fedac10..968f840b41c8a0 100644 --- a/tensorflow/core/grappler/verifiers/structure_verifier.h +++ b/tensorflow/core/grappler/verifiers/structure_verifier.h @@ -32,7 +32,7 @@ class StructureVerifier : public GraphVerifier { StructureVerifier() {} ~StructureVerifier() override {} - string name() const override { return "structure_verifier"; }; + std::string name() const override { return "structure_verifier"; }; absl::Status Verify(const GraphDef& graph) override; }; diff --git a/tensorflow/core/grappler/verifiers/structure_verifier_test.cc b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc index 562deb5367493c..d01a729d6c0796 100644 --- a/tensorflow/core/grappler/verifiers/structure_verifier_test.cc +++ b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc @@ -35,7 +35,7 @@ namespace { class StructureVerifierTest : public ::testing::Test { protected: StructureVerifierTest() { verifier_ = std::make_unique(); } - void SetGraph(const string& gdef_ascii) { + void SetGraph(const std::string& gdef_ascii) { CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &graph_)); } GraphDef graph_; diff --git a/tensorflow/core/kernels/autotune_conv_impl.h b/tensorflow/core/kernels/autotune_conv_impl.h index 63c6a64d1282a7..91530d7bbc269f 100644 --- a/tensorflow/core/kernels/autotune_conv_impl.h +++ b/tensorflow/core/kernels/autotune_conv_impl.h @@ -28,7 +28,7 @@ limitations under the License. namespace tensorflow::internal { template -StatusOr> AutotuneConvImpl( +absl::StatusOr> AutotuneConvImpl( OpKernelContext* ctx, std::vector>>& runners, bool actually_do_autotune, const LaunchFunc& launch_func, @@ -54,10 +54,10 @@ StatusOr> AutotuneConvImpl( TF_ASSIGN_OR_RETURN(auto desc, runner->ToAlgorithmDesc()); se::dnn::ProfileResult profile_result; - Status cudnn_launch_status = + absl::Status cudnn_launch_status = actually_do_autotune ? launch_func(allocator_used, runner, &profile_result) - : OkStatus(); + : absl::OkStatus(); if (!actually_do_autotune) { // Make the result valid according to `is_valid`. 
profile_result.set_algorithm(desc); diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD index cdd9af962f346e..6448e5ae36fead 100644 --- a/tensorflow/core/kernels/batching_util/BUILD +++ b/tensorflow/core/kernels/batching_util/BUILD @@ -535,6 +535,7 @@ tf_cc_test( "@local_xla//xla/tsl/lib/monitoring:cell_reader", "@local_xla//xla/tsl/lib/monitoring:test_utils", "@local_xla//xla/tsl/platform:criticality", + "@local_xla//xla/tsl/platform:statusor", ], ) diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base.cc b/tensorflow/core/kernels/batching_util/batch_resource_base.cc index abc5fa34f3a3ab..8c53a2299110a5 100644 --- a/tensorflow/core/kernels/batching_util/batch_resource_base.cc +++ b/tensorflow/core/kernels/batching_util/batch_resource_base.cc @@ -284,8 +284,12 @@ static auto* mixed_priority_batching_policy_value = void RecordBatchParamMixedPriorityBatchingPolicy( MixedPriorityBatchingPolicy mixed_priority_batching_policy, const std::string& model_name, const std::string& op_name) { - mixed_priority_batching_policy_value->GetCell(model_name, op_name) - ->Set(absl::StrCat(mixed_priority_batching_policy)); + auto policy_str = + GetMixedPriorityBatchingPolicyString(mixed_priority_batching_policy); + if (policy_str.ok()) { + mixed_priority_batching_policy_value->GetCell(model_name, op_name) + ->Set(std::string(*policy_str)); + } } void RecordBatchParamMaxEnqueuedBatches(int64_t max_enqueued_batches, diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc b/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc index 0e781747bcf170..a3b813ec6b7a4f 100644 --- a/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc +++ b/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc @@ -37,6 +37,7 @@ limitations under the License. 
#include "xla/tsl/lib/monitoring/cell_reader.h" #include "xla/tsl/lib/monitoring/test_utils.h" #include "xla/tsl/platform/criticality.h" +#include "xla/tsl/platform/statusor.h" #include "tensorflow/core/common_runtime/cost_constants.h" #include "tensorflow/core/common_runtime/cost_measurement.h" #include "tensorflow/core/common_runtime/cost_measurement_registry.h" @@ -282,9 +283,13 @@ TEST_P(BatchResourceBaseWithPriorityTest, BatchingWithMixedPriorityPolicy) { /*forced_warmup_batch_size=*/0)); } blocking_counter.Wait(); + + TF_ASSERT_OK_AND_ASSIGN(absl::string_view policy_str, + GetMixedPriorityBatchingPolicyString( + GetParam().mixed_priority_batching_policy)); EXPECT_EQ( mixed_priority_policy_reader_->Read("my_model_name", "my_batch_node"), - absl::StrCat(GetParam().mixed_priority_batching_policy)); + policy_str); for (const auto& [batch_size, expected_count] : GetParam().expected_batch_size_count) { diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler.cc b/tensorflow/core/kernels/batching_util/batch_scheduler.cc index 91bfad8642ecd8..e74f6dfe9ddc08 100644 --- a/tensorflow/core/kernels/batching_util/batch_scheduler.cc +++ b/tensorflow/core/kernels/batching_util/batch_scheduler.cc @@ -40,5 +40,23 @@ absl::StatusOr GetMixedPriorityBatchingPolicy( "Unknown mixed priority batching policy: %s", attr_value)); } +absl::StatusOr GetMixedPriorityBatchingPolicyString( + MixedPriorityBatchingPolicy policy) { + switch (policy) { + case MixedPriorityBatchingPolicy::kLowPriorityPaddingWithMaxBatchSize: + return kLowPriorityPaddingWithMaxBatchSizeAttrValue; + case MixedPriorityBatchingPolicy:: + kLowPriorityPaddingWithNextAllowedBatchSize: + return kLowPriorityPaddingWithNextAllowedBatchSizeAttrValue; + case MixedPriorityBatchingPolicy::kPriorityIsolation: + return kPriorityIsolationAttrValue; + case MixedPriorityBatchingPolicy::kPriorityMerge: + return kPriorityMergeAttrValue; + default: + return absl::InvalidArgumentError(absl::StrFormat( + "Unknown mixed priority batching policy: %d", policy)); + } +} + } // namespace serving } // namespace tensorflow diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler.h b/tensorflow/core/kernels/batching_util/batch_scheduler.h index 936473a1884dc9..4060a8b15fbd96 100644 --- a/tensorflow/core/kernels/batching_util/batch_scheduler.h +++ b/tensorflow/core/kernels/batching_util/batch_scheduler.h @@ -70,6 +70,9 @@ enum class MixedPriorityBatchingPolicy { absl::StatusOr GetMixedPriorityBatchingPolicy( absl::string_view attr_value); +absl::StatusOr GetMixedPriorityBatchingPolicyString( + MixedPriorityBatchingPolicy policy); + // The abstract superclass for a unit of work to be done as part of a batch. 
 //
 // An implementing subclass typically contains (or points to):
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
index d587f482763fde..fce07c171b8e85 100644
--- a/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
@@ -49,6 +49,12 @@ TEST(MixedPriorityBatchingPolicyTest, InvalidAttrValueError) {
           absl::StatusCode::kInvalidArgument,
           ::testing::HasSubstr(
               "Unknown mixed priority batching policy: invalid_attr_value")));
+  EXPECT_THAT(
+      GetMixedPriorityBatchingPolicyString(
+          static_cast<MixedPriorityBatchingPolicy>(4)),
+      absl_testing::StatusIs(
+          absl::StatusCode::kInvalidArgument,
+          ::testing::HasSubstr("Unknown mixed priority batching policy: 4")));
 }
 
 using MixedPriorityBatchingPolicyParameterizedTest = ::testing::TestWithParam<
@@ -59,6 +65,8 @@ TEST_P(MixedPriorityBatchingPolicyParameterizedTest,
   auto [attr_name, policy] = GetParam();
   EXPECT_THAT(GetMixedPriorityBatchingPolicy(attr_name),
               absl_testing::IsOkAndHolds(Eq(policy)));
+  EXPECT_THAT(GetMixedPriorityBatchingPolicyString(policy),
+              absl_testing::IsOkAndHolds(Eq(attr_name)));
 }
 
 INSTANTIATE_TEST_SUITE_P(
diff --git a/tensorflow/core/kernels/batching_util/concat_split_util.h b/tensorflow/core/kernels/batching_util/concat_split_util.h
index b5354be35c70a9..4ac0100fbdf44a 100644
--- a/tensorflow/core/kernels/batching_util/concat_split_util.h
+++ b/tensorflow/core/kernels/batching_util/concat_split_util.h
@@ -81,7 +81,7 @@ absl::Status Concat(OpKernelContext* context,
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
   if (std::is_same::value) {
     ConcatGPU(context, inputs_flat, output, &output_flat);
-    return OkStatus();
+    return absl::OkStatus();
   }
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   ConcatCPU(context->device(), inputs_flat, &output_flat);
@@ -198,9 +198,9 @@ absl::Status SplitCPU(OpKernelContext* context, const Tensor& input,
 
 // Handles the general case, on GPU.
 template <typename T>
-Status SplitGPU(OpKernelContext* context, const Tensor& input,
-                const gtl::ArraySlice& sizes,
-                std::vector* outputs) {
+absl::Status SplitGPU(OpKernelContext* context, const Tensor& input,
+                      const absl::Span& sizes,
+                      std::vector* outputs) {
   // TODO(olston, apassos): Implement this.
   LOG(FATAL) << "Not yet implemented";  // Crash ok
 }
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index b9b9c81342439c..dac9640b1e61ff 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -57,23 +57,23 @@ struct AccumulatorType {
 
 // Definition of the GPU implementations declared in bias_op.cc.
template -__global__ void BiasNHWCKernel(int32 nthreads, const T* __restrict__ input, +__global__ void BiasNHWCKernel(int32_t nthreads, const T* __restrict__ input, const T* __restrict__ bias, - T* __restrict__ output, int32 bias_size) { + T* __restrict__ output, int32_t bias_size) { GPU_1D_KERNEL_LOOP(index, nthreads) { - int32 bias_offset = index % bias_size; + int32_t bias_offset = index % bias_size; output[index] = ldg(input + index) + ldg(bias + bias_offset); } } template -__global__ void BiasNCHWKernel(int32 nthreads, const T* __restrict__ input, +__global__ void BiasNCHWKernel(int32_t nthreads, const T* __restrict__ input, const T* __restrict__ bias, - T* __restrict__ output, int32 bias_size, - int32 image_size) { + T* __restrict__ output, int32_t bias_size, + int32_t image_size) { GPU_1D_KERNEL_LOOP(index, nthreads) { - int32 index2 = index / image_size; - int32 bias_offset = index2 % bias_size; + int32_t index2 = index / image_size; + int32_t bias_offset = index2 % bias_size; output[index] = ldg(input + index) + ldg(bias + bias_offset); } } @@ -82,11 +82,12 @@ __global__ void BiasNCHWKernel(int32 nthreads, const T* __restrict__ input, // dimension. template void BiasGPU::compute(const GPUDevice& d, const T* input, const T* bias, - T* output, int32 batch, int32 height, int32 width, - int depth, int32 channel, TensorFormat data_format) { - const int32 bias_size = channel; - const int32 image_size = height * width * depth; - const int32 total_count = batch * bias_size * image_size; + T* output, int32_t batch, int32_t height, + int32_t width, int depth, int32_t channel, + TensorFormat data_format) { + const int32_t bias_size = channel; + const int32_t image_size = height * width * depth; + const int32_t total_count = batch * bias_size * image_size; if (total_count == 0) { return; } @@ -109,49 +110,49 @@ void BiasGPU::compute(const GPUDevice& d, const T* input, const T* bias, // A naive implementation that is functional on all cases. template -__global__ void BiasGradNHWC_Naive(int32 nthreads, +__global__ void BiasGradNHWC_Naive(int32_t nthreads, const T* __restrict__ output_backprop, T* __restrict__ bias_backprop, - int32 bias_size) { + int32_t bias_size) { GPU_1D_KERNEL_LOOP(index, nthreads) { - int32 bias_offset = index % bias_size; + int32_t bias_offset = index % bias_size; GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index)); } } // A naive implementation that is functional on all cases. 
template -__global__ void BiasGradNCHW_Naive(int32 nthreads, +__global__ void BiasGradNCHW_Naive(int32_t nthreads, const T* __restrict__ output_backprop, T* __restrict__ bias_backprop, - int32 bias_size, int32 image_size) { + int32_t bias_size, int32_t image_size) { GPU_1D_KERNEL_LOOP(index, nthreads) { - int32 index2 = index / image_size; - int32 bias_offset = index2 % bias_size; + int32_t index2 = index / image_size; + int32_t bias_offset = index2 % bias_size; GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index)); } } template __global__ void BiasGradNHWC_SharedAtomics( - int32 nthreads, const T* __restrict__ output_backprop, - T* __restrict__ bias_backprop, int32 bias_size) { + int32_t nthreads, const T* __restrict__ output_backprop, + T* __restrict__ bias_backprop, int32_t bias_size) { typedef typename AccumulatorType::type AccT; GPU_DYNAMIC_SHARED_MEM_DECL(8, char, s_buf); AccT* s_data = reinterpret_cast(s_buf); - for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) { + for (int32_t index = threadIdx.x; index < bias_size; index += blockDim.x) { s_data[index] = AccT(0); } __syncthreads(); - for (int32 index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + for (int32_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int32 bias_offset = index % bias_size; + int32_t bias_offset = index % bias_size; GpuAtomicAddShared(s_data + bias_offset, AccT(ldg(output_backprop + index))); } __syncthreads(); - for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) { + for (int32_t index = threadIdx.x; index < bias_size; index += blockDim.x) { GpuAtomicAdd(bias_backprop + index, T(s_data[index])); } } @@ -159,26 +160,26 @@ __global__ void BiasGradNHWC_SharedAtomics( template __global__ void BiasGradNCHW_SharedAtomics( const T* __restrict__ output_backprop, T* __restrict__ bias_backprop, - int32 batch, int32 bias_size, int32 image_size, int group_size) { + int32_t batch, int32_t bias_size, int32_t image_size, int group_size) { // Initialize the shared memory. typedef typename AccumulatorType::type AccT; - const int32 kSDataSize = 32; + const int32_t kSDataSize = 32; __shared__ AccT s_data[kSDataSize]; - for (int32 index = threadIdx.x; index < kSDataSize; index += blockDim.x) { + for (int32_t index = threadIdx.x; index < kSDataSize; index += blockDim.x) { s_data[index] = AccT(0); } __syncthreads(); // Accumulate all the values within this thread. They all have the same bias // index. - int32 bias_index = blockIdx.x % bias_size; - int32 group_index = blockIdx.x / bias_size; - int32 total_count = batch * image_size; + int32_t bias_index = blockIdx.x % bias_size; + int32_t group_index = blockIdx.x / bias_size; + int32_t total_count = batch * image_size; AccT sum(0); - for (int32 index = group_index * blockDim.x + threadIdx.x; + for (int32_t index = group_index * blockDim.x + threadIdx.x; index < total_count; index += blockDim.x * group_size) { - int32 image_offset = index % image_size; - int32 batch = index / image_size; + int32_t image_offset = index % image_size; + int32_t batch = index / image_size; T val = ldg(output_backprop + (batch * bias_size + bias_index) * image_size + image_offset); sum += AccT(val); @@ -192,11 +193,11 @@ __global__ void BiasGradNCHW_SharedAtomics( // Accumulate the results in the shared memory into the first element. // No syncthreads is needed since this is only in the same warp. 
- int32 thread_index = threadIdx.x; + int32_t thread_index = threadIdx.x; #if GOOGLE_CUDA if (thread_index < 32) { AccT data = s_data[thread_index]; - for (int32 delta = warpSize / 2; delta > 0; delta /= 2) { + for (int32_t delta = warpSize / 2; delta > 0; delta /= 2) { data += GpuShuffleXorSync(kCudaWarpAll, data, delta); } if (thread_index == 0) { @@ -219,20 +220,20 @@ __global__ void BiasGradNCHW_SharedAtomics( template void BiasGradGPU::compute(const GPUDevice& d, const T* output_backprop, - T* bias_backprop, int32 batch, int32 height, - int32 width, int32 depth, int32 channel, + T* bias_backprop, int32_t batch, int32_t height, + int32_t width, int32_t depth, int32_t channel, TensorFormat data_format) { - const int32 bias_size = channel; - const int32 image_size = height * width * depth; - const int32 total_count = batch * bias_size * image_size; + const int32_t bias_size = channel; + const int32_t image_size = height * width * depth; + const int32_t total_count = batch * bias_size * image_size; if (total_count == 0) { return; } - static constexpr int32 kWarpSize = 32; + static constexpr int32_t kWarpSize = 32; GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d); const int max_shared_memory_size = d.sharedMemPerBlock() / 2; - int32 shared_memory_size = 0; + int32_t shared_memory_size = 0; if (data_format == FORMAT_NHWC) { shared_memory_size = bias_size * sizeof(typename AccumulatorType::type); } diff --git a/tensorflow/core/kernels/bias_op_gpu.h b/tensorflow/core/kernels/bias_op_gpu.h index 0ece14a946cd19..60f17e6de240de 100644 --- a/tensorflow/core/kernels/bias_op_gpu.h +++ b/tensorflow/core/kernels/bias_op_gpu.h @@ -68,12 +68,12 @@ class BiasGradGPUProfileResult { } BiasAddGradGPUMode algorithm() const { return algorithm_; } void set_algorithm(BiasAddGradGPUMode val) { algorithm_ = val; } - uint64 elapsed_time() const { return elapsed_time_; } - void set_elapsed_time(uint64 val) { elapsed_time_ = val; } + uint64_t elapsed_time() const { return elapsed_time_; } + void set_elapsed_time(uint64_t val) { elapsed_time_ = val; } private: BiasAddGradGPUMode algorithm_ = BiasAddGradGPUMode::kInvalid; - uint64 elapsed_time_ = std::numeric_limits::max(); + uint64_t elapsed_time_ = std::numeric_limits::max(); }; } // namespace tensorflow diff --git a/tensorflow/core/kernels/bincount_op_gpu.cu.cc b/tensorflow/core/kernels/bincount_op_gpu.cu.cc index 529fe0b278621f..19a45db1ff01b6 100644 --- a/tensorflow/core/kernels/bincount_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bincount_op_gpu.cu.cc @@ -36,11 +36,11 @@ namespace functor { template struct BincountFunctor { - static Status Compute(OpKernelContext* context, - const typename TTypes::ConstTensor& arr, - const typename TTypes::ConstTensor& weights, - typename TTypes::Tensor& output, - const Tidx num_bins) { + static absl::Status Compute(OpKernelContext* context, + const typename TTypes::ConstTensor& arr, + const typename TTypes::ConstTensor& weights, + typename TTypes::Tensor& output, + const Tidx num_bins) { if (weights.size() != 0) { return errors::Unimplemented( "Weights are not yet supported by the GPU implementation of Bincount." @@ -48,7 +48,7 @@ struct BincountFunctor { " tf.function(jit_compile=True)."); } if (output.size() == 0) { - return OkStatus(); + return absl::OkStatus(); } if (tensorflow::OpDeterminismRequired()) { // TODO(reedwm): Is this really nondeterministic? 
@@ -88,11 +88,11 @@ struct BincountFunctor { } Tensor temp_storage; TF_RETURN_IF_ERROR(context->allocate_temp( - DataTypeToEnum::value, + DataTypeToEnum::value, TensorShape({static_cast(temp_storage_bytes)}), &temp_storage)); - void* d_temp_storage = temp_storage.flat().data(); + void* d_temp_storage = temp_storage.flat().data(); // The second HistogramEven is to actual run with d_temp_storage // allocated with temp_storage_bytes. err = gpuprim::DeviceHistogram::HistogramEven( @@ -109,7 +109,7 @@ struct BincountFunctor { return errors::Internal( "Could not launch HistogramEven: ", GpuGetErrorString(err), "."); } - return OkStatus(); + return absl::OkStatus(); } }; @@ -126,11 +126,11 @@ __global__ void BincountReduceKernel(const Tidx* in, T* out, const int nthreads, template struct BincountFunctor { - static Status Compute(OpKernelContext* context, - const typename TTypes::ConstTensor& arr, - const typename TTypes::ConstTensor& weights, - typename TTypes::Tensor& output, - const Tidx num_bins) { + static absl::Status Compute(OpKernelContext* context, + const typename TTypes::ConstTensor& arr, + const typename TTypes::ConstTensor& weights, + typename TTypes::Tensor& output, + const Tidx num_bins) { const int nthreads = arr.dimension(0); auto d = context->eigen_gpu_device(); @@ -206,11 +206,11 @@ __global__ void BincountColReduceSharedKernel(const Tidx* in, const T* weights, template struct BincountReduceFunctor { - static Status Compute(OpKernelContext* context, - const typename TTypes::ConstTensor& in, - const typename TTypes::ConstTensor& weights, - typename TTypes::Tensor& out, - const Tidx num_bins) { + static absl::Status Compute(OpKernelContext* context, + const typename TTypes::ConstTensor& in, + const typename TTypes::ConstTensor& weights, + typename TTypes::Tensor& out, + const Tidx num_bins) { const int num_rows = in.dimension(0); const int num_cols = in.dimension(1); diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc index e354966e744549..e58902ddfccc21 100644 --- a/tensorflow/core/kernels/broadcast_to_op.cc +++ b/tensorflow/core/kernels/broadcast_to_op.cc @@ -149,11 +149,11 @@ TF_CALL_float8_e4m3fn(REGISTER_KERNEL); // registration requires all int32 inputs and outputs to be in host memory. 
REGISTER_KERNEL_BUILDER(Name("BroadcastTo") .Device(DEVICE_GPU) - .TypeConstraint("T") + .TypeConstraint("T") .HostMemory("input") .HostMemory("shape") .HostMemory("output"), - BroadcastToOp); + BroadcastToOp); #endif #if defined(PLUGGABLE_DEVICE_SUPPORTED_MACOS) REGISTER_KERNEL_BUILDER(Name("BroadcastTo") diff --git a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc index 93df5a624e76bd..d69244d0c67cad 100644 --- a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc @@ -34,18 +34,19 @@ typedef Eigen::GpuDevice GPUDevice; template __global__ void BucketizeCustomKernel( - const int32 size_in, const T* __restrict__ in, const int32 size_boundaries, - GpuDeviceArrayStruct boundaries_array, int32* __restrict__ out) { + const int32_t size_in, const T* __restrict__ in, + const int32_t size_boundaries, GpuDeviceArrayStruct boundaries_array, + int32_t* __restrict__ out) { const float* boundaries = GetGpuDeviceArrayOnDevice(&boundaries_array); GPU_DYNAMIC_SHARED_MEM_DECL(sizeof(float), unsigned char, shared_mem); float* shared_mem_boundaries = reinterpret_cast(shared_mem); if (useSharedMem) { - int32 lidx = threadIdx.y * blockDim.x + threadIdx.x; - int32 blockSize = blockDim.x * blockDim.y; + int32_t lidx = threadIdx.y * blockDim.x + threadIdx.x; + int32_t blockSize = blockDim.x * blockDim.y; - for (int32 i = lidx; i < size_boundaries; i += blockSize) { + for (int32_t i = lidx; i < size_boundaries; i += blockSize) { shared_mem_boundaries[i] = boundaries[i]; } @@ -56,11 +57,11 @@ __global__ void BucketizeCustomKernel( GPU_1D_KERNEL_LOOP(i, size_in) { T value = in[i]; - int32 bucket = 0; - int32 count = size_boundaries; + int32_t bucket = 0; + int32_t count = size_boundaries; while (count > 0) { - int32 l = bucket; - int32 step = count / 2; + int32_t l = bucket; + int32_t step = count / 2; l += step; if (!(value < static_cast(boundaries[l]))) { bucket = ++l; @@ -78,10 +79,10 @@ namespace functor { template struct BucketizeFunctor { // PRECONDITION: boundaries_vector must be sorted. 
- static Status Compute(OpKernelContext* context, - const typename TTypes::ConstTensor& input, - const std::vector& boundaries_vector, - typename TTypes::Tensor& output) { + static absl::Status Compute(OpKernelContext* context, + const typename TTypes::ConstTensor& input, + const std::vector& boundaries_vector, + typename TTypes::Tensor& output) { const GPUDevice& d = context->eigen_device(); GpuDeviceArrayOnHost boundaries_array(context, @@ -93,8 +94,8 @@ struct BucketizeFunctor { TF_RETURN_IF_ERROR(boundaries_array.Finalize()); GpuLaunchConfig config = GetGpuLaunchConfig(input.size(), d); - int32 shared_mem_size = sizeof(float) * boundaries_vector.size(); - const int32 kMaxSharedMemBytes = 16384; + int32_t shared_mem_size = sizeof(float) * boundaries_vector.size(); + const int32_t kMaxSharedMemBytes = 16384; if (shared_mem_size < d.sharedMemPerBlock() && shared_mem_size < kMaxSharedMemBytes) { TF_CHECK_OK(GpuLaunchKernel(BucketizeCustomKernel, @@ -108,7 +109,7 @@ struct BucketizeFunctor { config.thread_per_block, 0, d.stream(), input.size(), input.data(), boundaries_vector.size(), boundaries_array.data(), output.data())); } - return OkStatus(); + return absl::OkStatus(); } }; } // namespace functor diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc index 0a4e011815b80d..98e35f138363d5 100644 --- a/tensorflow/core/kernels/cast_op.cc +++ b/tensorflow/core/kernels/cast_op.cc @@ -184,10 +184,10 @@ class GpuCastOp : public CastOpBase { } private: - Status Prepare() { + absl::Status Prepare() { if (external_src_dtype_ == external_dst_dtype_) { work_ = nullptr; // Identity - return OkStatus(); + return absl::OkStatus(); } if (src_dtype_ == DT_BOOL) { work_ = GetGpuCastFromBool(dst_dtype_); @@ -228,7 +228,7 @@ class GpuCastOp : public CastOpBase { } else if (src_dtype_ == DT_UINT4) { work_ = GetGpuCastFromUint4(dst_dtype_); } - return work_ == nullptr ? Unimplemented() : OkStatus(); + return work_ == nullptr ? 
Unimplemented() : absl::OkStatus(); } }; #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -263,14 +263,14 @@ CURRY_TYPES2(REGISTER_CAST_GPU, std::complex); CURRY_TYPES2(REGISTER_CAST_GPU, std::complex); #else REGISTER_CAST_GPU(bool, bfloat16); -REGISTER_CAST_GPU(int8, bfloat16); -REGISTER_CAST_GPU(int16, bfloat16); -REGISTER_CAST_GPU(int32, bfloat16); -REGISTER_CAST_GPU(int64, bfloat16); -REGISTER_CAST_GPU(uint8, bfloat16); -REGISTER_CAST_GPU(uint16, bfloat16); -REGISTER_CAST_GPU(uint32, bfloat16); -REGISTER_CAST_GPU(uint64, bfloat16); +REGISTER_CAST_GPU(int8_t, bfloat16); +REGISTER_CAST_GPU(int16_t, bfloat16); +REGISTER_CAST_GPU(int32_t, bfloat16); +REGISTER_CAST_GPU(int64_t, bfloat16); +REGISTER_CAST_GPU(uint8_t, bfloat16); +REGISTER_CAST_GPU(uint16_t, bfloat16); +REGISTER_CAST_GPU(uint32_t, bfloat16); +REGISTER_CAST_GPU(uint64_t, bfloat16); REGISTER_CAST_GPU(Eigen::half, bfloat16); REGISTER_CAST_GPU(float, bfloat16); REGISTER_CAST_GPU(double, bfloat16); @@ -301,43 +301,43 @@ REGISTER_CAST_GPU(float8_e4m3fn, float8_e5m2); REGISTER_CAST_GPU(float8_e4m3fn, float8_e4m3fn); REGISTER_CAST_GPU(int4, int4); -REGISTER_CAST_GPU(int4, int8); -REGISTER_CAST_GPU(int4, int16); -REGISTER_CAST_GPU(int4, int32); +REGISTER_CAST_GPU(int4, int8_t); +REGISTER_CAST_GPU(int4, int16_t); +REGISTER_CAST_GPU(int4, int32_t); REGISTER_CAST_GPU(int4, int64_t); REGISTER_CAST_GPU(int4, uint4); -REGISTER_CAST_GPU(int4, uint8); -REGISTER_CAST_GPU(int4, uint16); -REGISTER_CAST_GPU(int4, uint32); +REGISTER_CAST_GPU(int4, uint8_t); +REGISTER_CAST_GPU(int4, uint16_t); +REGISTER_CAST_GPU(int4, uint32_t); REGISTER_CAST_GPU(int4, uint64_t); -REGISTER_CAST_GPU(int8, int4); -REGISTER_CAST_GPU(int16, int4); -REGISTER_CAST_GPU(int32, int4); +REGISTER_CAST_GPU(int8_t, int4); +REGISTER_CAST_GPU(int16_t, int4); +REGISTER_CAST_GPU(int32_t, int4); REGISTER_CAST_GPU(int64_t, int4); REGISTER_CAST_GPU(uint4, int4); -REGISTER_CAST_GPU(uint8, int4); -REGISTER_CAST_GPU(uint16, int4); -REGISTER_CAST_GPU(uint32, int4); +REGISTER_CAST_GPU(uint8_t, int4); +REGISTER_CAST_GPU(uint16_t, int4); +REGISTER_CAST_GPU(uint32_t, int4); REGISTER_CAST_GPU(uint64_t, int4); -REGISTER_CAST_GPU(uint4, int8); -REGISTER_CAST_GPU(uint4, int16); -REGISTER_CAST_GPU(uint4, int32); +REGISTER_CAST_GPU(uint4, int8_t); +REGISTER_CAST_GPU(uint4, int16_t); +REGISTER_CAST_GPU(uint4, int32_t); REGISTER_CAST_GPU(uint4, int64_t); REGISTER_CAST_GPU(uint4, uint4); -REGISTER_CAST_GPU(uint4, uint8); -REGISTER_CAST_GPU(uint4, uint16); -REGISTER_CAST_GPU(uint4, uint32); +REGISTER_CAST_GPU(uint4, uint8_t); +REGISTER_CAST_GPU(uint4, uint16_t); +REGISTER_CAST_GPU(uint4, uint32_t); REGISTER_CAST_GPU(uint4, uint64_t); -REGISTER_CAST_GPU(int8, uint4); -REGISTER_CAST_GPU(int16, uint4); -REGISTER_CAST_GPU(int32, uint4); +REGISTER_CAST_GPU(int8_t, uint4); +REGISTER_CAST_GPU(int16_t, uint4); +REGISTER_CAST_GPU(int32_t, uint4); REGISTER_CAST_GPU(int64_t, uint4); -REGISTER_CAST_GPU(uint8, uint4); -REGISTER_CAST_GPU(uint16, uint4); -REGISTER_CAST_GPU(uint32, uint4); +REGISTER_CAST_GPU(uint8_t, uint4); +REGISTER_CAST_GPU(uint16_t, uint4); +REGISTER_CAST_GPU(uint32_t, uint4); REGISTER_CAST_GPU(uint64_t, uint4); #undef REGISTER_CAST_GPU diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc index 7963edda7afaca..5f5552edd519ca 100644 --- a/tensorflow/core/kernels/cast_op_impl_int64.cc +++ b/tensorflow/core/kernels/cast_op_impl_int64.cc @@ -38,7 +38,7 @@ CastFunctorType GetCpuCastFromInt64(DataType dst_dtype) { (defined(TENSORFLOW_USE_ROCM) && 
TENSORFLOW_USE_ROCM) CastFunctorType GetGpuCastFromInt64(DataType dst_dtype) { #if defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) - CAST_CASE(GPUDevice, int64, bfloat16); + CAST_CASE(GPUDevice, int64_t, bfloat16); #else CURRY_TYPES3(CAST_CASE, GPUDevice, int64); #endif diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc index b1d2b3954aa91d..31ceecab9a84ee 100644 --- a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc +++ b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc @@ -38,10 +38,10 @@ typedef Eigen::GpuDevice GPUDevice; template __global__ void CheckNumericsKernel(const T* __restrict__ data, int size, int abnormal_detected[2]) { - const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const int32 total_thread_count = gridDim.x * blockDim.x; + const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t total_thread_count = gridDim.x * blockDim.x; - int32 offset = thread_id; + int32_t offset = thread_id; while (offset < size) { if (isnan(data[offset])) { @@ -61,10 +61,10 @@ __global__ void CheckNumericsKernel(const T* __restrict__ data, int size, template __global__ void CheckNumericsKernelV2(const T* __restrict__ data, int size, int abnormal_detected[3]) { - const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const int32 total_thread_count = gridDim.x * blockDim.x; + const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t total_thread_count = gridDim.x * blockDim.x; - int32 offset = thread_id; + int32_t offset = thread_id; while (offset < size) { if (isnan(data[offset])) { @@ -85,8 +85,8 @@ template struct CheckNumericsLaunch { void Run(const GPUDevice& d, const T* data, int size, int abnormal_detected[2]) { - const int32 block_size = d.maxGpuThreadsPerBlock(); - const int32 num_blocks = + const int32_t block_size = d.maxGpuThreadsPerBlock(); + const int32_t num_blocks = (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) / block_size; @@ -103,8 +103,8 @@ template struct CheckNumericsLaunchV2 { void Run(const GPUDevice& d, const T* data, int size, int abnormal_detected[3]) { - const int32 block_size = d.maxGpuThreadsPerBlock(); - const int32 num_blocks = + const int32_t block_size = d.maxGpuThreadsPerBlock(); + const int32_t num_blocks = (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) / block_size; diff --git a/tensorflow/core/kernels/collective_nccl.cc b/tensorflow/core/kernels/collective_nccl.cc index c44680b27124aa..9e69fb36115602 100644 --- a/tensorflow/core/kernels/collective_nccl.cc +++ b/tensorflow/core/kernels/collective_nccl.cc @@ -22,10 +22,11 @@ limitations under the License. 
namespace tensorflow { -NcclBase::NcclBase(CollectiveType type, const string& name) +NcclBase::NcclBase(CollectiveType type, const std::string& name) : type_(type), name_(name), col_ctx_(nullptr), col_params_(nullptr) {} -Status NcclBase::InitializeCollectiveParams(CollectiveParams* col_params) { +absl::Status NcclBase::InitializeCollectiveParams( + CollectiveParams* col_params) { if (type_ != col_params->instance.type) { return errors::Internal("Expected initialized type ", type_, " to match type in CollectiveParams ", @@ -60,10 +61,10 @@ Status NcclBase::InitializeCollectiveParams(CollectiveParams* col_params) { ", expected name ", expected_name); } - return OkStatus(); + return absl::OkStatus(); } -Status NcclBase::InitializeCollectiveContext( +absl::Status NcclBase::InitializeCollectiveContext( std::shared_ptr col_ctx) { col_ctx_ = col_ctx; col_params_ = col_ctx->col_params.get(); diff --git a/tensorflow/core/kernels/collective_nccl.h b/tensorflow/core/kernels/collective_nccl.h index 4fc4bebb008e3c..26a096fa3f8bb4 100644 --- a/tensorflow/core/kernels/collective_nccl.h +++ b/tensorflow/core/kernels/collective_nccl.h @@ -22,19 +22,20 @@ namespace tensorflow { class NcclBase : public CollectiveImplementationInterface { public: - explicit NcclBase(CollectiveType type, const string& name); + explicit NcclBase(CollectiveType type, const std::string& name); ~NcclBase() override = default; // No-op for this collective implementation. - Status InitializeCollectiveParams(CollectiveParams* col_params) override; + absl::Status InitializeCollectiveParams( + CollectiveParams* col_params) override; // Initializes the device objects and device localities. - Status InitializeCollectiveContext( + absl::Status InitializeCollectiveContext( std::shared_ptr col_ctx) override; protected: const CollectiveType type_; - const string name_; + const std::string name_; std::shared_ptr col_ctx_; const CollectiveParams* col_params_; // Not owned }; diff --git a/tensorflow/core/kernels/collective_nccl_test.cc b/tensorflow/core/kernels/collective_nccl_test.cc index b01eb56b44baca..47cbc6ed4e388b 100644 --- a/tensorflow/core/kernels/collective_nccl_test.cc +++ b/tensorflow/core/kernels/collective_nccl_test.cc @@ -141,7 +141,7 @@ class NcclTestBase : public ::testing::Test { if (VLOG_IS_ON(3)) { string str_buf; for (const auto& x : expected) { - strings::StrAppend(&str_buf, " ", x); + absl::StrAppend(&str_buf, " ", x); } VLOG(3) << "Expected output " << str_buf; } diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc index 4237a8a6c8b438..58cdf8afd02485 100644 --- a/tensorflow/core/kernels/concat_lib_gpu.cc +++ b/tensorflow/core/kernels/concat_lib_gpu.cc @@ -74,8 +74,9 @@ void ConcatGPU( inputs_flat, Tensor* output, typename TTypes::Tensor* output_flat) { if (inputs_flat.size() < 16) { - if (output->NumElements() < std::numeric_limits::max()) { - ConcatGPUSlice(c->eigen_gpu_device(), inputs_flat, output_flat); + if (output->NumElements() < std::numeric_limits::max()) { + ConcatGPUSlice(c->eigen_gpu_device(), inputs_flat, + output_flat); } else { ConcatGPUSlice(c->eigen_gpu_device(), inputs_flat, output_flat); @@ -84,8 +85,8 @@ void ConcatGPU( // Switching indexing to int64 might cause performance issues. // Hence, we keep int32 indexing in the GPU kernel unless we need to // switch to int64. 
- if (output->NumElements() < std::numeric_limits::max()) { - ConcatGPUCall(c, inputs_flat, output_flat); + if (output->NumElements() < std::numeric_limits::max()) { + ConcatGPUCall(c, inputs_flat, output_flat); } else { ConcatGPUCall(c, inputs_flat, output_flat); } diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc index a6cece16d20ddf..58b6957a120f2a 100644 --- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc +++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc @@ -126,7 +126,7 @@ void ConcatGPUSlice( Eigen::array size; size[0] = inputs_flat[i]->dimension(0); size[1] = inputs_flat[i]->dimension(1); - if (std::is_same::value) { + if (std::is_same::value) { To32Bit(*output).slice(offset, size).device(gpu_device) = To32Bit(*inputs_flat[i]); } else { @@ -159,7 +159,7 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device, // on most processors // possibly due to decreasing occupancy // 4096 inputs is a lot, most code will take the smem path - const int32 kMaxSmemBytesPerformance = 16384; + const int32_t kMaxSmemBytesPerformance = 16384; if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance) { TF_CHECK_OK(GpuLaunchKernel( concat_variable_kernel, config.block_count, diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h index 60d2e83194eefa..1afa68d87430ed 100644 --- a/tensorflow/core/kernels/conv_2d_gpu.h +++ b/tensorflow/core/kernels/conv_2d_gpu.h @@ -1046,7 +1046,7 @@ template struct SwapDimension1And2InTensor3 { typedef GPUDevice Device; void operator()(const Device& d, const T* in, - const gtl::ArraySlice& combined_dims, T* out) { + const absl::Span& combined_dims, T* out) { Dimension<3> input_dims = {static_cast(combined_dims[0]), static_cast(combined_dims[1]), static_cast(combined_dims[2])}; @@ -1060,7 +1060,7 @@ template struct SwapDimension0And2InTensor3 { typedef GPUDevice Device; void operator()(const Device& d, const T* in, - const gtl::ArraySlice& combined_dims, T* out) { + const absl::Span& combined_dims, T* out) { Dimension<3> input_dims = {static_cast(combined_dims[0]), static_cast(combined_dims[1]), static_cast(combined_dims[2])}; diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index 4b647d1e6f5a43..65696babf71ddc 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -48,7 +48,7 @@ template struct LaunchConv2DBackpropInputOp; // A dummy type to group forward backward data autotune results together. 
struct ConvBackwardDataAutotuneGroup { - static string name() { return "ConvBwdData"; } + static std::string name() { return "ConvBwdData"; } }; typedef AutotuneSingleton -struct LaunchConv2DBackpropInputOp { +struct LaunchConv2DBackpropInputOp { void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, const Tensor& out_backprop, const Tensor& filter, int row_dilation, int col_dilation, int row_stride, int col_stride, const Padding& padding, const std::vector& explicit_paddings, Tensor* in_backprop, TensorFormat data_format) { - LaunchConv2DBackpropInputOpImpl launcher; + LaunchConv2DBackpropInputOpImpl launcher; launcher(ctx, use_cudnn, cudnn_use_autotune, out_backprop, filter, row_dilation, col_dilation, row_stride, col_stride, padding, explicit_paddings, in_backprop, data_format); @@ -85,8 +85,8 @@ void LaunchConv2DBackpropInputOpGpuImpl( using se::dnn::AlgorithmDesc; using se::dnn::ProfileResult; - std::vector strides(4, 1); - std::vector dilations(4, 1); + std::vector strides(4, 1); + std::vector dilations(4, 1); auto input_h = GetTensorDimIndex(data_format, 'H'); auto input_w = GetTensorDimIndex(data_format, 'W'); strides[input_h] = row_stride; @@ -147,10 +147,10 @@ void LaunchConv2DBackpropInputOpGpuImpl( dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) { // 1x1 filter, so call cublas directly. - const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size; - const uint64 k = dims.out_depth; - const uint64 n = dims.in_depth; + const uint64_t m = dims.batch_size * dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size; + const uint64_t k = dims.out_depth; + const uint64_t n = dims.in_depth; auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), out_backprop.template flat().size()); @@ -175,10 +175,10 @@ void LaunchConv2DBackpropInputOpGpuImpl( data_format == FORMAT_NHWC) { // The input data and filter have the same height/width, and we are not // using grouped convolution, so call cublas directly. 
- const uint64 m = dims.batch_size; - const uint64 k = dims.out_depth; - const uint64 n = dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size * dims.in_depth; + const uint64_t m = dims.batch_size; + const uint64_t k = dims.out_depth; + const uint64_t n = dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size * dims.in_depth; auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), out_backprop.template flat().size()); @@ -282,7 +282,8 @@ void LaunchConv2DBackpropInputOpGpuImpl( // (2) NHWC -> OHWI Tensor transformed_filter; - const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status { + const auto transform_filter = + [&](FilterTensorFormat dst_format) -> absl::Status { VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO) << " to " << ToString(dst_format); @@ -300,7 +301,7 @@ void LaunchConv2DBackpropInputOpGpuImpl( To32Bit(filter.tensor()), To32Bit(transformed_filter.tensor())); - return OkStatus(); + return absl::OkStatus(); }; if (compute_data_format == FORMAT_NCHW) { @@ -394,7 +395,7 @@ void LaunchConv2DBackpropInputOpGpuImpl( auto autotune_entry = std::move(entry_or).value(); DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx); - Status cudnn_launch_status = + absl::Status cudnn_launch_status = LaunchAutotunedConv(autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::BACKWARD_DATA, stream, input_desc, in_backprop_ptr, filter_desc, filter_ptr, @@ -534,23 +535,23 @@ DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC template <> -void SpatialConvolutionBackwardInputFunc::operator()( - const GPUDevice&, typename TTypes::Tensor, - typename TTypes::ConstTensor, - typename TTypes::ConstTensor, Eigen::DenseIndex, +void SpatialConvolutionBackwardInputFunc::operator()( + const GPUDevice&, typename TTypes::Tensor, + typename TTypes::ConstTensor, + typename TTypes::ConstTensor, Eigen::DenseIndex, Eigen::DenseIndex, Eigen::DenseIndex, Eigen::DenseIndex); extern template struct SpatialConvolutionBackwardInputFunc; template <> void SpatialConvolutionBackwardInputWithExplicitPaddingFunc< - GPUDevice, int32>::operator()(const GPUDevice&, - typename TTypes::Tensor, - typename TTypes::ConstTensor, - typename TTypes::ConstTensor, - Eigen::DenseIndex, Eigen::DenseIndex, - Eigen::DenseIndex, Eigen::DenseIndex, - Eigen::DenseIndex, Eigen::DenseIndex, - Eigen::DenseIndex, Eigen::DenseIndex); + GPUDevice, int32_t>::operator()(const GPUDevice&, + typename TTypes::Tensor, + typename TTypes::ConstTensor, + typename TTypes::ConstTensor, + Eigen::DenseIndex, Eigen::DenseIndex, + Eigen::DenseIndex, Eigen::DenseIndex, + Eigen::DenseIndex, Eigen::DenseIndex, + Eigen::DenseIndex, Eigen::DenseIndex); extern template struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc< GPUDevice, int32>; @@ -578,9 +579,9 @@ REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") Conv2DBackpropInputOp); REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") .Device(DEVICE_GPU) - .TypeConstraint("T") + .TypeConstraint("T") .HostMemory("input_sizes"), - Conv2DBackpropInputOp); + Conv2DBackpropInputOp); // To be used inside depthwise_conv_grad_op.cc. // TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc. 
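Editor's note on the kernel hunks above: they apply one mechanical migration throughout, replacing legacy TensorFlow aliases (string, int32/uint64, Status, OkStatus(), gtl::ArraySlice, gtl::InlinedVector) with their spelled-out counterparts (std::string, int32_t/uint64_t, absl::Status, absl::OkStatus(), absl::Span, absl::InlinedVector). A minimal standalone sketch of the target style follows; SumValues is a hypothetical helper for illustration only and is not part of this patch.

// Hypothetical helper, not from this patch: shows the spelled-out types the
// diffs above converge on (absl::Status, fixed-width ints, absl::Span,
// std::string) in place of the legacy TensorFlow aliases.
#include <cstdint>
#include <string>

#include "absl/status/status.h"
#include "absl/types/span.h"

absl::Status SumValues(const std::string& name,
                       absl::Span<const int32_t> values, int64_t* out) {
  if (values.empty()) {
    return absl::InvalidArgumentError("no values for " + name);
  }
  int64_t total = 0;
  for (int32_t v : values) total += v;  // int32_t instead of the int32 alias.
  *out = total;
  return absl::OkStatus();  // absl::OkStatus() instead of bare OkStatus().
}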
diff --git a/tensorflow/core/kernels/conv_grad_shape_utils.cc b/tensorflow/core/kernels/conv_grad_shape_utils.cc index 42e114ad33581d..a7e53647b72bf9 100644 --- a/tensorflow/core/kernels/conv_grad_shape_utils.cc +++ b/tensorflow/core/kernels/conv_grad_shape_utils.cc @@ -53,10 +53,10 @@ namespace { absl::Status ConvBackpropExtractAndVerifyDimension( absl::string_view label, const TensorShape& input_shape, const TensorShape& filter_shape, const TensorShape& output_shape, - const absl::Span dilations, const std::vector& strides, - Padding padding, int64_t padding_before, int64_t padding_after, - int spatial_dim, int filter_spatial_dim, - ConvBackpropSpatialDimension* dim) { + const absl::Span dilations, + const std::vector& strides, Padding padding, + int64_t padding_before, int64_t padding_after, int spatial_dim, + int filter_spatial_dim, ConvBackpropSpatialDimension* dim) { dim->input_size = input_shape.dim_size(spatial_dim); dim->filter_size = filter_shape.dim_size(filter_spatial_dim); dim->output_size = output_shape.dim_size(spatial_dim); @@ -96,9 +96,10 @@ absl::Status ConvBackpropComputeDimensionsV2( absl::string_view label, int num_spatial_dims, const TensorShape& input_shape, const TensorShape& filter_shape, const TensorShape& out_backprop_shape, - const absl::Span dilations, const std::vector& strides, - Padding padding, absl::Span explicit_paddings, - TensorFormat data_format, ConvBackpropDimensions* dims) { + const absl::Span dilations, + const std::vector& strides, Padding padding, + absl::Span explicit_paddings, TensorFormat data_format, + ConvBackpropDimensions* dims) { // The + 2 in the following line is for the batch and feature dimensions. const int num_dims = num_spatial_dims + 2; if (input_shape.dims() != num_dims) { @@ -161,9 +162,9 @@ absl::Status ConvBackpropComputeDimensionsV2( absl::Status ConvBackpropComputeDimensions( absl::string_view label, int num_spatial_dims, const TensorShape& input_shape, const TensorShape& filter_shape, - const TensorShape& out_backprop_shape, const std::vector& strides, + const TensorShape& out_backprop_shape, const std::vector& strides, Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims) { - static constexpr std::array one_dilations = {{1, 1, 1, 1, 1}}; + static constexpr std::array one_dilations = {{1, 1, 1, 1, 1}}; return ConvBackpropComputeDimensionsV2( label, num_spatial_dims, input_shape, filter_shape, out_backprop_shape, one_dilations, strides, padding, /*explicit_paddings=*/{}, data_format, @@ -181,13 +182,13 @@ absl::Status Conv2DBackpropComputeInputShape( } if (input_sizes.dim_size(0) == 4) { - return TensorShapeUtils::MakeShape(input_sizes.vec(), input_shape); + return TensorShapeUtils::MakeShape(input_sizes.vec(), input_shape); } if (input_sizes.dim_size(0) == 2) { const int batch_size = GetTensorDim(out_backprop_shape, data_format, 'N'); - const int output_height = input_sizes.vec()(0); - const int output_width = input_sizes.vec()(1); + const int output_height = input_sizes.vec()(0); + const int output_width = input_sizes.vec()(1); const int output_depth = filter_shape.dim_size(2); if (output_height < 0 || output_width < 0) { return errors::InvalidArgument( diff --git a/tensorflow/core/kernels/conv_grad_shape_utils.h b/tensorflow/core/kernels/conv_grad_shape_utils.h index d83c1bb25ee02f..cc0708c4fe4f74 100644 --- a/tensorflow/core/kernels/conv_grad_shape_utils.h +++ b/tensorflow/core/kernels/conv_grad_shape_utils.h @@ -69,7 +69,7 @@ struct ConvBackpropDimensions { absl::Status ConvBackpropComputeDimensions( 
absl::string_view label, int num_spatial_dims, const TensorShape& input_shape, const TensorShape& filter_shape, - const TensorShape& out_backprop_shape, const std::vector& strides, + const TensorShape& out_backprop_shape, const std::vector& strides, Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims); // The V2 version computes the same outputs with arbitrary dilation rate and @@ -78,8 +78,8 @@ absl::Status ConvBackpropComputeDimensions( absl::Status ConvBackpropComputeDimensionsV2( absl::string_view label, int num_spatial_dims, const TensorShape& input_shape, const TensorShape& filter_shape, - const TensorShape& out_backprop_shape, absl::Span dilations, - const std::vector& strides, Padding padding, + const TensorShape& out_backprop_shape, absl::Span dilations, + const std::vector& strides, Padding padding, absl::Span explicit_paddings, TensorFormat data_format, ConvBackpropDimensions* dims); diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index c604f3bf4bbc4d..8a9c8e8aa8a132 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -58,7 +58,7 @@ absl::Status InitConv2DParameters(const OpKernelConstruction* context, TF_RETURN_IF_ERROR( context->GetAttr("explicit_paddings", ¶ms->explicit_paddings)); } - string data_format_string; + std::string data_format_string; TF_RETURN_IF_ERROR(context->GetAttr("data_format", &data_format_string)); TF_REQUIRES(FormatFromString(data_format_string, ¶ms->data_format), errors::InvalidArgument("Invalid data format")); diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h index 65c63fec1e439f..199cd94c99cbaa 100644 --- a/tensorflow/core/kernels/conv_ops.h +++ b/tensorflow/core/kernels/conv_ops.h @@ -45,8 +45,8 @@ template struct LaunchConvOp { void operator()(OpKernelContext* context, bool cudnn_use_autotune, const Tensor& input, const Tensor& filter, - const std::vector& dilations, - const std::vector& strides, Padding padding, + const std::vector& dilations, + const std::vector& strides, Padding padding, const std::vector& explicit_paddings, TensorFormat data_format, Tensor* output); }; @@ -85,13 +85,13 @@ struct Im2ColBufferResource : public ResourceBase { // the buffer memory held by this resource. mutex mu; T* data; - string DebugString() const { return "Im2ColBufferResource"; } + std::string DebugString() const { return "Im2ColBufferResource"; } }; // Convolution parameters specified by Op attributes. 
struct Conv2DParameters { - std::vector dilations; - std::vector strides; + std::vector dilations; + std::vector strides; Padding padding; TensorFormat data_format; std::vector explicit_paddings; diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc index 72bad756b4d0fd..00c02ccd51c711 100644 --- a/tensorflow/core/kernels/conv_ops_3d.cc +++ b/tensorflow/core/kernels/conv_ops_3d.cc @@ -65,7 +65,7 @@ template class Conv3DOp : public BinaryOp { public: explicit Conv3DOp(OpKernelConstruction* context) : BinaryOp(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -175,8 +175,8 @@ class Conv3DOp : public BinaryOp { } private: - std::vector dilation_; - std::vector stride_; + std::vector dilation_; + std::vector stride_; Padding padding_; TensorFormat data_format_; bool cudnn_use_autotune_; diff --git a/tensorflow/core/kernels/conv_ops_benchmark_test.cc b/tensorflow/core/kernels/conv_ops_benchmark_test.cc index 183372705aa3df..779fbb7a50bcd6 100644 --- a/tensorflow/core/kernels/conv_ops_benchmark_test.cc +++ b/tensorflow/core/kernels/conv_ops_benchmark_test.cc @@ -189,7 +189,7 @@ static int64_t Conv2DWithPostOpsFlops(int batch, int height, int width, template static Conv2DWithBiasAndActivationGraph Conv2DWithBiasAndActivation( int batch, int height, int width, int in_depth, int filter_w, int filter_h, - int out_depth, const string& activation_type, + int out_depth, const std::string& activation_type, TensorFormat data_format = FORMAT_NHWC) { Conv2DWithBiasGraph conv_graph = Conv2DWithBias(batch, height, width, in_depth, filter_w, filter_h, @@ -249,7 +249,7 @@ static Conv2DWithBatchNormGraph Conv2DWithBatchNorm( template static Conv2DWithBatchNormAndActivationGraph Conv2DWithBatchNormAndActivation( int batch, int height, int width, int in_depth, int filter_w, int filter_h, - int out_depth, const string& activation_type, + int out_depth, const std::string& activation_type, TensorFormat data_format = FORMAT_NHWC) { Conv2DWithBatchNormGraph conv_graph = Conv2DWithBatchNorm(batch, height, width, in_depth, filter_w, filter_h, @@ -271,11 +271,10 @@ static Conv2DWithBatchNormAndActivationGraph Conv2DWithBatchNormAndActivation( // Creates a tensorflow graph with a single FusedConv2D (with BiasAdd) node and // fuses into it additional computations (e.g. Relu). 
template -static Graph* FusedConv2DWithBias(int batch, int height, int width, - int in_depth, int filter_w, int filter_h, - int out_depth, - const std::vector& fused_ops = {}, - TensorFormat data_format = FORMAT_NHWC) { +static Graph* FusedConv2DWithBias( + int batch, int height, int width, int in_depth, int filter_w, int filter_h, + int out_depth, const std::vector& fused_ops = {}, + TensorFormat data_format = FORMAT_NHWC) { Graph* graph = new Graph(OpRegistry::Global()); Tensor images_t = data_format == FORMAT_NHWC @@ -341,7 +340,7 @@ static Graph* FusedConv2DWithBias(int batch, int height, int width, template static Graph* FusedConv2DWithBatchNorm( int batch, int height, int width, int in_depth, int filter_w, int filter_h, - int out_depth, const std::vector& fused_ops = {}, + int out_depth, const std::vector& fused_ops = {}, TensorFormat data_format = FORMAT_NHWC) { Graph* graph = new Graph(OpRegistry::Global()); diff --git a/tensorflow/core/kernels/conv_ops_bfloat16.cc b/tensorflow/core/kernels/conv_ops_bfloat16.cc index 37507841647f0b..d2b9bc71b5d3a3 100644 --- a/tensorflow/core/kernels/conv_ops_bfloat16.cc +++ b/tensorflow/core/kernels/conv_ops_bfloat16.cc @@ -110,8 +110,8 @@ void LaunchConvOp::operator()( Tensor* output) { // Get spatial dims for dilations and strides. int spatial_dims = input.dims() - 2; - gtl::InlinedVector strides_spatial(spatial_dims); - gtl::InlinedVector dilations_spatial(spatial_dims); + absl::InlinedVector strides_spatial(spatial_dims); + absl::InlinedVector dilations_spatial(spatial_dims); for (int i = 0; i < spatial_dims; ++i) { strides_spatial[i] = GetTensorDim(strides, data_format, static_cast(i + '0')); @@ -166,9 +166,9 @@ void LaunchConv2DOp::operator()( const std::vector& explicit_paddings, Tensor* output, TensorFormat data_format) { // Cast strides and dilations. - gtl::InlinedVector casted_strides = {row_stride, col_stride}; - gtl::InlinedVector casted_dilations = {row_dilation, - col_dilation}; + absl::InlinedVector casted_strides = {row_stride, col_stride}; + absl::InlinedVector casted_dilations = {row_dilation, + col_dilation}; auto* stream = ctx->op_device_context()->stream(); const bool cast_to_float = !IsBF16SupportedInOps(stream); diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc index 8887103240c9d7..ef031685c4093e 100644 --- a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc +++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc @@ -711,7 +711,7 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // Compute the shape of the output tensor, and allocate it. TensorShape padded_shape; - TTypes::ConstMatrix paddings_matrix = paddings.matrix(); + TTypes::ConstMatrix paddings_matrix = paddings.matrix(); for (int d = 0; d < dims; ++d) { const int32_t before = paddings_matrix(d, 0); // Pad before existing elements. 
@@ -867,7 +867,7 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { } private: - std::vector strides_; + std::vector strides_; Padding padding_; bool align_corners_; int offset_; diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h index 51a33288c8e8bb..154f43a226cfdb 100644 --- a/tensorflow/core/kernels/conv_ops_fused_impl.h +++ b/tensorflow/core/kernels/conv_ops_fused_impl.h @@ -307,7 +307,7 @@ struct LaunchFusedConv2DOp { }; template <> -struct LaunchFusedConv2DOp; +struct LaunchFusedConv2DOp; template <> struct LaunchFusedConv2DOp; @@ -732,7 +732,7 @@ class FusedConv2DOp : public OpKernel { // convolution with BiasAdd, but in practice it doesn't work, cuDNN ignores // this parameter and always does Relu activation. if (std::is_same::value) { - if (std::is_same::value || std::is_same::value) { + if (std::is_same::value || std::is_same::value) { patterns = {{FCT::kBiasAdd, {"BiasAdd"}}, {FCT::kBiasAddWithRelu, {"BiasAdd", "Relu"}}}; } else { diff --git a/tensorflow/core/kernels/conv_ops_fused_int8.cc b/tensorflow/core/kernels/conv_ops_fused_int8.cc index 7f919d5087dbbe..e23864960c1568 100644 --- a/tensorflow/core/kernels/conv_ops_fused_int8.cc +++ b/tensorflow/core/kernels/conv_ops_fused_int8.cc @@ -300,9 +300,8 @@ struct LaunchFusedConv2DOpCpuInt8Helper { }; template <> -struct LaunchFusedConv2DOp - : LaunchFusedConv2DOpCpuInt8Helper { -}; +struct LaunchFusedConv2DOp + : LaunchFusedConv2DOpCpuInt8Helper {}; template <> struct LaunchFusedConv2DOp diff --git a/tensorflow/core/kernels/conv_ops_gpu.cc b/tensorflow/core/kernels/conv_ops_gpu.cc index d781f26094e583..1570c1dc0d00e9 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.cc +++ b/tensorflow/core/kernels/conv_ops_gpu.cc @@ -88,7 +88,7 @@ StatusOr> AutotuneFusedConv( auto* stream = ctx->op_device_context()->stream(); if (!autotune_map->Find(params, &autotune_entry)) { - profiler::ScopedAnnotation trace("cudnn_autotuning"); + tsl::profiler::ScopedAnnotation trace("cudnn_autotuning"); se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}), stream); @@ -253,7 +253,7 @@ StatusOr> AutotuneUnfusedConv( auto* stream = ctx->op_device_context()->stream(); if (!autotune_map->Find(conv_parameters, &autotune_entry)) { - profiler::ScopedAnnotation annotation("cudnn_autotuning"); + tsl::profiler::ScopedAnnotation annotation("cudnn_autotuning"); #if GOOGLE_CUDA se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}), diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h index faf028935d3dd8..74274977e10897 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.h +++ b/tensorflow/core/kernels/conv_ops_gpu.h @@ -71,18 +71,18 @@ class DnnScratchAllocator : public se::ScratchAllocator { DnnScratchAllocator(int64_t memory_limit, OpKernelContext* context) : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {} int64 GetMemoryLimitInBytes() override { return memory_limit_; } - tsl::StatusOr> AllocateBytes( + absl::StatusOr> AllocateBytes( int64_t byte_size) override { Tensor temporary_memory; if (byte_size < 0) { - return tsl::Status{absl::StatusCode::kInvalidArgument, - "Requested negative byte size!"}; + return absl::Status{absl::StatusCode::kInvalidArgument, + "Requested negative byte size!"}; } if (byte_size > memory_limit_) { - return tsl::Status{absl::StatusCode::kUnavailable, - absl::StrCat("Requested memory size (", byte_size, - ") exceeds the max memory limit (", - memory_limit_, ").")}; + return 
absl::Status{absl::StatusCode::kUnavailable, + absl::StrCat("Requested memory size (", byte_size, + ") exceeds the max memory limit (", + memory_limit_, ").")}; } AllocationAttributes allocation_attr; allocation_attr.retry_on_failure = false; @@ -90,7 +90,7 @@ class DnnScratchAllocator : public se::ScratchAllocator { DT_UINT8, TensorShape({byte_size}), &temporary_memory, AllocatorAttributes(), allocation_attr)); if (!allocation_status.ok()) { - return tsl::Status{ + return absl::Status{ absl::StatusCode::kUnavailable, absl::StrCat("Failed to allocate the requested memory size (", byte_size, ").")}; @@ -99,7 +99,7 @@ class DnnScratchAllocator : public se::ScratchAllocator { // allocator. allocated_tensors_.push_back(temporary_memory); total_byte_size_ += byte_size; - return tsl::StatusOr>( + return absl::StatusOr>( AsDeviceMemory(temporary_memory.flat().data(), temporary_memory.flat().size())); } @@ -118,7 +118,8 @@ typedef Eigen::GpuDevice GPUDevice; // autotuning with a cache, or by falling back to a default if // 'cudnn_use_autotune' is true and cuDNN is the statically-chosen DNN backend. template -StatusOr> AutotuneFusedConv( +absl::StatusOr> +AutotuneFusedConv( bool cudnn_use_autotune, AutotuneMap>* autotune_map, @@ -135,7 +136,7 @@ StatusOr> AutotuneFusedConv( se::DeviceMemory side_input_ptr, int64_t scratch_size); template -StatusOr> AutotuneUnfusedConv( +absl::StatusOr> AutotuneUnfusedConv( bool cudnn_use_autotune, AutotuneMap>* autotune_map, const ConvParameters& conv_parameters, OpKernelContext* ctx, @@ -158,7 +159,7 @@ AllocateScratchOrFallback(se::ScratchAllocator* scratch_allocator, auto workspace_size = selected_runner->GetWorkspaceSize(); - se::DeviceMemoryBase scratch_memory; + stream_executor::DeviceAddressBase scratch_memory; if (workspace_size > 0) { auto scratch_or = scratch_allocator->AllocateBytes(workspace_size); if (scratch_or.ok()) { @@ -209,9 +210,10 @@ Status LaunchAutotunedConv(const AutotuneEntry& autotune_entry, AllocateScratchOrFallback( scratch_allocator, primary, no_scratch_fallback)); auto& runner = *std::get(runner_and_scratch); - return runner(stream, nullptr, - std::get(runner_and_scratch), in_ptr, - filter_ptr, out_ptr); + return runner( + stream, nullptr, + std::get(runner_and_scratch), + in_ptr, filter_ptr, out_ptr); } else { auto dnn = stream->parent()->AsDnn(); if (dnn == nullptr) { @@ -234,7 +236,7 @@ Status LaunchAutotunedConv(const AutotuneEntry& autotune_entry, std::unique_ptr runner = std::move(runner_or).value(); - se::DeviceMemoryBase scratch_memory; + stream_executor::DeviceAddressBase scratch_memory; int64_t workspace_size = runner->GetWorkspaceSize(); if (workspace_size > 0) { auto scratch_or = scratch_allocator->AllocateBytes(workspace_size); diff --git a/tensorflow/core/kernels/conv_ops_impl.h b/tensorflow/core/kernels/conv_ops_impl.h index 0d3fc798bbe3c2..3d5a0ac76e5b5b 100644 --- a/tensorflow/core/kernels/conv_ops_impl.h +++ b/tensorflow/core/kernels/conv_ops_impl.h @@ -178,13 +178,13 @@ struct LaunchGrouped { std::array shuffle({3, 0, 1, 2, 4}); // Compute pre shuffle dimemnsions. - auto pre_shuffle = [&](const Tensor& tensor) -> std::array { + auto pre_shuffle = [&](const Tensor& tensor) -> std::array { return {tensor.dim_size(0), tensor.dim_size(1), tensor.dim_size(2), num_groups, tensor.dim_size(3) / num_groups}; }; // Compute post shuffle dimemnsions. 
- auto post_shuffle = [&](const Tensor& tensor) -> std::array { + auto post_shuffle = [&](const Tensor& tensor) -> std::array { return {num_groups, tensor.dim_size(0), tensor.dim_size(1), tensor.dim_size(2), tensor.dim_size(3) / num_groups}; }; @@ -262,8 +262,8 @@ template struct LaunchConvOp { void operator()(OpKernelContext* context, bool cudnn_use_autotune, const Tensor& input, const Tensor& filter, - const std::vector& dilations, - const std::vector& strides, const Padding padding, + const std::vector& dilations, + const std::vector& strides, const Padding padding, const std::vector& explicit_paddings, TensorFormat data_format, Tensor* output) { // For now just calling existing launchers based on spatial dimensions. @@ -292,7 +292,7 @@ class ConvOp : public BinaryOp { OP_REQUIRES(context, groups_ == 1, absl::UnimplementedError( "Grouped/Depthwise Convolutions are not supported yet.")); - string data_format_str; + std::string data_format_str; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str)); OP_REQUIRES(context, data_format_str == "CHANNELS_LAST" || diff --git a/tensorflow/core/kernels/conv_ops_int32.cc b/tensorflow/core/kernels/conv_ops_int32.cc index 46320bded04997..a582aeb4b7277c 100644 --- a/tensorflow/core/kernels/conv_ops_int32.cc +++ b/tensorflow/core/kernels/conv_ops_int32.cc @@ -30,12 +30,12 @@ template struct Conv2DOp; // CPU implementation, don't register this EigenTensor-based version. #if !defined(USE_GEMM_FOR_CONV) REGISTER_KERNEL_BUILDER( - Name("Conv2D").Device(DEVICE_CPU).TypeConstraint("T"), - Conv2DOp); + Name("Conv2D").Device(DEVICE_CPU).TypeConstraint("T"), + Conv2DOp); #endif // USE_GEMM_FOR_CONV REGISTER_KERNEL_BUILDER( - Name("Conv").Device(DEVICE_CPU).TypeConstraint("T"), - ConvOp); + Name("Conv").Device(DEVICE_CPU).TypeConstraint("T"), + ConvOp); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template <> diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc index 929d5cb51b4c08..caff583b570092 100644 --- a/tensorflow/core/kernels/conv_ops_test.cc +++ b/tensorflow/core/kernels/conv_ops_test.cc @@ -86,8 +86,9 @@ class FusedResizePadConvOpTest : public OpsTestBase { const int right_padding = 0; AddInputFromArray(image.shape(), image.flat()); - AddInputFromArray(TensorShape({2}), {resized_height, resized_width}); - AddInputFromArray( + AddInputFromArray(TensorShape({2}), + {resized_height, resized_width}); + AddInputFromArray( TensorShape({4, 2}), {0, 0, top_padding, bottom_padding, left_padding, right_padding, 0, 0}); AddInputFromArray(filter.shape(), filter.flat()); @@ -128,8 +129,8 @@ class FusedResizePadConvOpTest : public OpsTestBase { int resize_height, int y_padding, int x_padding, int filter_size, int filter_count, bool resize_align_corners, - const string& pad_mode, int stride, - const string& padding, DataType dtype) { + const std::string& pad_mode, int stride, + const std::string& padding, DataType dtype) { Scope root = tensorflow::Scope::NewRootScope(); using namespace ::tensorflow::ops; // NOLINT(build/namespaces) @@ -188,8 +189,9 @@ class FusedResizePadConvOpTest : public OpsTestBase { void CompareFusedPadOnlyAndSeparate(int input_width, int input_height, int input_depth, int y_padding, int x_padding, int filter_size, - int filter_count, const string& pad_mode, - int stride, const string& padding, + int filter_count, + const std::string& pad_mode, int stride, + const std::string& padding, DataType dtype) { Scope root = tensorflow::Scope::NewRootScope(); using namespace ::tensorflow::ops; // 
NOLINT(build/namespaces) @@ -488,7 +490,7 @@ class FusedConv2DOpTest : public OpsTestBase { static constexpr int kImageBatchCount = 8; static constexpr bool kIsInt8 = - std::is_same::value || std::is_same::value; + std::is_same::value || std::is_same::value; using BiasAddGraphRunner = std::function& explicit_paddings, Tensor* output, bool allow_gpu_device = false, int stride = 1) { Scope root = tensorflow::Scope::NewRootScope(); @@ -780,7 +782,7 @@ class FusedConv2DOpTest : public OpsTestBase { TensorShape shape = arg_data.shape(); Tensor arg_data_float = Tensor(dtype_args, shape); for (int index = 0; index < arg_data.NumElements(); index++) { - int8 v = *(reinterpret_cast(arg_data.data()) + index); + int8_t v = *(reinterpret_cast(arg_data.data()) + index); *(reinterpret_cast(arg_data_float.data()) + index) = static_cast(v); } @@ -886,7 +888,7 @@ class FusedConv2DOpTest : public OpsTestBase { void ExpectMatch(const Tensor& x, const Tensor& y, double atol) { constexpr bool exact_match = - std::is_same::value || std::is_same::value; + std::is_same::value || std::is_same::value; if (exact_match) { test::ExpectEqual(x, y); } else { @@ -903,7 +905,7 @@ class FusedConv2DOpTest : public OpsTestBase { constexpr int int8_scale = 80; - using ConvT = typename std::conditional::type; + using ConvT = typename std::conditional::type; DataType dtype_conv = DataTypeToEnum::v(); TensorShape image_shape{image_batch_count, image_height, image_width, @@ -1120,7 +1122,7 @@ class FusedConv2DOpTest : public OpsTestBase { // Verifies that computing Conv2D+FusedBatchNorm+{Activation} in a graph is // identical to FusedConv2D. void VerifyConv2DWithBatchNormAndActivation( - const string& activation, int filter_size, int filter_count, + const std::string& activation, int filter_size, int filter_count, const std::vector& explicit_paddings = {}, int depth = kDepth, int image_width = kImageWidth, int image_height = kImageHeight, int image_batch_count = kImageBatchCount) { @@ -1353,7 +1355,7 @@ REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest, // SpatialConvolutionAndActivation); #endif -using FusedBiasAddDataTypes = ::testing::Types; +using FusedBiasAddDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest, FusedBiasAddDataTypes); diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc index 3ebd3a4fa76d93..531b6377b2ff64 100644 --- a/tensorflow/core/kernels/conv_ops_using_gemm.cc +++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc @@ -433,7 +433,7 @@ class Conv2DUsingGemmOp : public BinaryOp { explicit Conv2DUsingGemmOp(OpKernelConstruction* context) : BinaryOp(context) { OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -557,7 +557,7 @@ class Conv2DUsingGemmOp : public BinaryOp { } private: - std::vector strides_; + std::vector strides_; Padding padding_; TensorFormat data_format_; diff --git a/tensorflow/core/kernels/count_up_to_op.cc b/tensorflow/core/kernels/count_up_to_op.cc index 5abc17a8aa2aaf..fe0709186c6809 100644 --- a/tensorflow/core/kernels/count_up_to_op.cc +++ b/tensorflow/core/kernels/count_up_to_op.cc @@ -102,7 +102,7 @@ class ResourceCountUpToOp : public OpKernel { Name("ResourceCountUpTo").TypeConstraint("T").Device(DEVICE_CPU), \ ResourceCountUpToOp) 
-REGISTER(int32); +REGISTER(int32_t); REGISTER(int64_t); #undef REGISTER diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc index 401f1572298d9b..7c6d9132dd2142 100644 --- a/tensorflow/core/kernels/ctc_decoder_ops.cc +++ b/tensorflow/core/kernels/ctc_decoder_ops.cc @@ -91,7 +91,7 @@ class CTCDecodeHelper { " batch_size: ", batch_size); } - auto seq_len_t = (*seq_len)->vec(); + auto seq_len_t = (*seq_len)->vec(); for (int b = 0; b < batch_size; ++b) { if (!(seq_len_t(b) <= max_time)) { @@ -220,7 +220,7 @@ class CTCGreedyDecoderOp : public OpKernel { input_list_t.emplace_back(inputs_t.data() + t * batch_size * num_classes, batch_size, num_classes); } - auto seq_len_t = seq_len->vec(); + auto seq_len_t = seq_len->vec(); auto log_prob_t = log_prob->matrix(); log_prob_t.setZero(); @@ -309,7 +309,7 @@ class CTCBeamSearchDecoderOp : public OpKernel { &decoded_values, &decoded_shape)); auto inputs_t = inputs->tensor(); - auto seq_len_t = seq_len->vec(); + auto seq_len_t = seq_len->vec(); auto log_prob_t = log_prob->matrix(); const TensorShape& inputs_shape = inputs->shape(); diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc index 63d31fcf62d46d..a1b851feb206db 100644 --- a/tensorflow/core/kernels/ctc_loss_op.cc +++ b/tensorflow/core/kernels/ctc_loss_op.cc @@ -127,7 +127,7 @@ class CTCLossOp : public OpKernel { errors::InvalidArgument("len(sequence_length) != batch_size. ", "len(sequence_length): ", seq_len->dim_size(0), " batch_size: ", batch_size)); - auto seq_len_t = seq_len->vec(); + auto seq_len_t = seq_len->vec(); OP_REQUIRES(ctx, labels_indices->dim_size(0) == labels_values->dim_size(0), errors::InvalidArgument( @@ -166,7 +166,7 @@ class CTCLossOp : public OpKernel { 0, " and ", batch_size, " but saw: ", batch_indices)); - auto values = g.values(); + auto values = g.values(); std::vector* b_values = &labels_t[batch_indices]; b_values->resize(values.size()); for (int i = 0; i < values.size(); ++i) (*b_values)[i] = values(i); diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.h b/tensorflow/core/kernels/cudnn_pooling_gpu.h index 970eb533318bb4..d344bb09da1c39 100644 --- a/tensorflow/core/kernels/cudnn_pooling_gpu.h +++ b/tensorflow/core/kernels/cudnn_pooling_gpu.h @@ -39,9 +39,9 @@ class DnnPooling3dOp { public: static void Compute(OpKernelContext* context, se::dnn::PoolingMode pooling_mode, - const std::array& size, - const std::array& stride, - const std::array& padding, + const std::array& size, + const std::array& stride, + const std::array& padding, TensorFormat data_format, const Tensor& tensor_in, Tensor* output); }; @@ -53,10 +53,10 @@ class DnnPooling3dGradOp { public: static void Compute(OpKernelContext* context, se::dnn::PoolingMode pooling_mode, - const std::array& window, - const std::array& stride, - const std::array& padding, - const std::array& output_size, + const std::array& window, + const std::array& stride, + const std::array& padding, + const std::array& output_size, TensorFormat data_format, const Tensor& out_backprop, const TensorShape& tensor_in_shape, const Tensor* tensor_in, const Tensor* tensor_out, diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc index f81c3176424843..00ce115511e76d 100644 --- a/tensorflow/core/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc @@ -1924,7 +1924,7 @@ class CudnnRNNForwardOpV2 << algo_config->algorithm()->tensor_ops_enabled() << ")."; return OkStatus(); } - 
profiler::ScopedAnnotation trace("cudnn_autotuning"); + tsl::profiler::ScopedAnnotation trace("cudnn_autotuning"); // Create temp tensors when profiling backprop pass. auto data_type = input->dtype(); diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc index 0f32478dcc7dc6..4f3e04d7cd4c7f 100644 --- a/tensorflow/core/kernels/cwise_op_abs.cc +++ b/tensorflow/core/kernels/cwise_op_abs.cc @@ -18,7 +18,7 @@ limitations under the License. namespace tensorflow { REGISTER8(UnaryOp, CPU, "Abs", functor::abs, Eigen::half, bfloat16, float, - double, int8, int16, int32, int64_t); + double, int8_t, int16_t, int32_t, int64_t); REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128); @@ -44,7 +44,7 @@ REGISTER_KERNEL_BUILDER(Name("Abs") .Device(DEVICE_DEFAULT) .HostMemory("x") .HostMemory("y") - .TypeConstraint("T"), - UnaryOp>); + .TypeConstraint("T"), + UnaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc index f2ca5677469f18..50fce03c1e0f95 100644 --- a/tensorflow/core/kernels/cwise_op_acos.cc +++ b/tensorflow/core/kernels/cwise_op_acos.cc @@ -17,9 +17,9 @@ limitations under the License. namespace tensorflow { REGISTER5(UnaryOp, CPU, "Acos", functor::acos, Eigen::half, bfloat16, float, - double, int8); -REGISTER5(UnaryOp, CPU, "Acos", functor::acos, int16, int32, int64_t, complex64, - complex128); + double, int8_t); +REGISTER5(UnaryOp, CPU, "Acos", functor::acos, int16_t, int32_t, int64_t, + complex64, complex128); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc index fdb3de69b65033..35483f244836fa 100644 --- a/tensorflow/core/kernels/cwise_op_add_1.cc +++ b/tensorflow/core/kernels/cwise_op_add_1.cc @@ -16,11 +16,11 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER6(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32, - int64_t, bfloat16); +REGISTER6(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, + int32_t, int64_t, bfloat16); REGISTER6(BinaryOp, CPU, "AddV2", functor::add, float, Eigen::half, double, - int32, int64_t, bfloat16); + int32_t, int64_t, bfloat16); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -55,14 +55,14 @@ REGISTER_KERNEL_BUILDER(Name("Add") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); REGISTER_KERNEL_BUILDER(Name("AddV2") .Device(DEVICE_DEFAULT) .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc index 088a10fcb738f7..bb897eedca48b0 100644 --- a/tensorflow/core/kernels/cwise_op_add_2.cc +++ b/tensorflow/core/kernels/cwise_op_add_2.cc @@ -22,13 +22,13 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. #if !defined(__ANDROID_TYPES_SLIM__) -REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8, - complex128, tstring); +REGISTER6(BinaryOp, CPU, "Add", functor::add, int8_t, int16_t, complex64, + uint8_t, complex128, tstring); // Notice: String is excluded to allow marking AddV2 is_commutative and // is_aggregate. 
-REGISTER8(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8, - uint16, uint32, uint64, complex128); +REGISTER8(BinaryOp, CPU, "AddV2", functor::add, int8_t, int16_t, complex64, + uint8_t, uint16_t, uint32_t, uint64_t, complex128); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_bitwise_and.cc b/tensorflow/core/kernels/cwise_op_bitwise_and.cc index 6509665e89864d..927c017cbabb82 100644 --- a/tensorflow/core/kernels/cwise_op_bitwise_and.cc +++ b/tensorflow/core/kernels/cwise_op_bitwise_and.cc @@ -17,8 +17,8 @@ limitations under the License. namespace tensorflow { -REGISTER8(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32, - int64_t, uint8, uint16, uint32, uint64); +REGISTER8(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8_t, int16_t, + int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_bitwise_or.cc b/tensorflow/core/kernels/cwise_op_bitwise_or.cc index d974d249fac06f..aab01711419c2c 100644 --- a/tensorflow/core/kernels/cwise_op_bitwise_or.cc +++ b/tensorflow/core/kernels/cwise_op_bitwise_or.cc @@ -19,8 +19,8 @@ namespace tensorflow { #if !defined(MLIR_GENERATED_CPU_KERNELS_ENABLED) || \ !defined(MLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED) -REGISTER8(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32, - int64_t, uint8, uint16, uint32, uint64); +REGISTER8(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8_t, int16_t, + int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t); #endif #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc index 831f4d86c48277..a7a7c91fde59f0 100644 --- a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc +++ b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc @@ -17,8 +17,8 @@ limitations under the License. namespace tensorflow { -REGISTER8(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32, - int64_t, uint8, uint16, uint32, uint64); +REGISTER8(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8_t, int16_t, + int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc index 949b162509ecff..f8cbd536b24731 100644 --- a/tensorflow/core/kernels/cwise_op_clip.cc +++ b/tensorflow/core/kernels/cwise_op_clip.cc @@ -269,12 +269,12 @@ REGISTER_CPU_KERNEL(Eigen::half); REGISTER_CPU_KERNEL(float); REGISTER_CPU_KERNEL(double); REGISTER_CPU_KERNEL(bfloat16); -REGISTER_CPU_KERNEL(int8); -REGISTER_CPU_KERNEL(int16); -REGISTER_CPU_KERNEL(int32); +REGISTER_CPU_KERNEL(int8_t); +REGISTER_CPU_KERNEL(int16_t); +REGISTER_CPU_KERNEL(int32_t); REGISTER_CPU_KERNEL(int64_t); -REGISTER_CPU_KERNEL(uint8); -REGISTER_CPU_KERNEL(uint16); +REGISTER_CPU_KERNEL(uint8_t); +REGISTER_CPU_KERNEL(uint16_t); REGISTER_CPU_KERNEL(std::complex); REGISTER_CPU_KERNEL(std::complex); #undef REGISTER_CPU_KERNEL diff --git a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc index e84885218ea1a3..f22d3bd3db7c96 100644 --- a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc +++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc @@ -24,11 +24,11 @@ limitations under the License. 
namespace tensorflow { template -__global__ void UnaryClipCustomKernel(const int32 size_in, - const T *__restrict__ in0, - const T *__restrict__ in1, - const T *__restrict__ in2, - T *__restrict__ out) { +__global__ void UnaryClipCustomKernel(const int32_t size_in, + const T* __restrict__ in0, + const T* __restrict__ in1, + const T* __restrict__ in2, + T* __restrict__ out) { GPU_1D_KERNEL_LOOP(i, size_in) { T value = in2[0] < in0[i] ? in2[0] : in0[i]; out[i] = value < in1[0] ? in1[0] : value; @@ -36,11 +36,11 @@ __global__ void UnaryClipCustomKernel(const int32 size_in, } template -__global__ void BinaryRightClipCustomKernel(const int32 size_in, - const T *__restrict__ in0, - const T *__restrict__ in1, - const T *__restrict__ in2, - T *__restrict__ out) { +__global__ void BinaryRightClipCustomKernel(const int32_t size_in, + const T* __restrict__ in0, + const T* __restrict__ in1, + const T* __restrict__ in2, + T* __restrict__ out) { GPU_1D_KERNEL_LOOP(i, size_in) { T value = in2[i] < in0[i] ? in2[i] : in0[i]; out[i] = value < in1[0] ? in1[0] : value; @@ -48,11 +48,11 @@ __global__ void BinaryRightClipCustomKernel(const int32 size_in, } template -__global__ void BinaryLeftClipCustomKernel(const int32 size_in, - const T *__restrict__ in0, - const T *__restrict__ in1, - const T *__restrict__ in2, - T *__restrict__ out) { +__global__ void BinaryLeftClipCustomKernel(const int32_t size_in, + const T* __restrict__ in0, + const T* __restrict__ in1, + const T* __restrict__ in2, + T* __restrict__ out) { GPU_1D_KERNEL_LOOP(i, size_in) { T value = in2[0] < in0[i] ? in2[0] : in0[i]; out[i] = value < in1[i] ? in1[i] : value; diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc index 8840579bdeccee..d537a7f39e0753 100644 --- a/tensorflow/core/kernels/cwise_op_div.cc +++ b/tensorflow/core/kernels/cwise_op_div.cc @@ -19,10 +19,10 @@ namespace tensorflow { REGISTER6(BinaryOp, CPU, "Div", functor::div, float, Eigen::half, double, bfloat16, complex64, complex128); -REGISTER8(BinaryOp, CPU, "Div", functor::safe_div, uint8, uint16, uint32, - uint64, int8, int16, int32, int64_t); -REGISTER8(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8, uint16, - uint32, uint64, int8, int16, int32, int64_t); +REGISTER8(BinaryOp, CPU, "Div", functor::safe_div, uint8_t, uint16_t, uint32_t, + uint64_t, int8_t, int16_t, int32_t, int64_t); +REGISTER8(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8_t, uint16_t, + uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t); REGISTER4(BinaryOp, CPU, "TruncateDiv", functor::truncate_div_real, Eigen::half, bfloat16, float, double); REGISTER6(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double, @@ -35,8 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("Div") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_equal_to_1.cc index 87b499c96fdd05..7aecd4f62b2bf7 100644 --- a/tensorflow/core/kernels/cwise_op_equal_to_1.cc +++ b/tensorflow/core/kernels/cwise_op_equal_to_1.cc @@ -17,9 +17,9 @@ limitations under the License. 
namespace tensorflow { REGISTER7(BinaryOp, CPU, "Equal", functor::equal_to, float, Eigen::half, double, - uint8, int8, int16, bfloat16); -REGISTER8(BinaryOp, CPU, "Equal", functor::equal_to, uint16, uint32, uint64, - qint8, qint16, quint8, quint16, qint32); + uint8_t, int8_t, int16_t, bfloat16); +REGISTER8(BinaryOp, CPU, "Equal", functor::equal_to, uint16_t, uint32_t, + uint64_t, qint8, qint16, quint8, quint16, qint32); REGISTER_KERNEL_BUILDER( Name("ApproximateEqual").Device(DEVICE_CPU).TypeConstraint("T"), ApproximateEqualOp); @@ -32,8 +32,8 @@ REGISTER_KERNEL_BUILDER(Name("Equal") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_equal_to_2.cc index 29f15d7f7d9a67..e17cda8f2fbab6 100644 --- a/tensorflow/core/kernels/cwise_op_equal_to_2.cc +++ b/tensorflow/core/kernels/cwise_op_equal_to_2.cc @@ -22,8 +22,8 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. #if !defined(__ANDROID_TYPES_SLIM__) -REGISTER6(BinaryOp, CPU, "Equal", functor::equal_to, int32, int64_t, complex64, - complex128, tstring, bool); +REGISTER6(BinaryOp, CPU, "Equal", functor::equal_to, int32_t, int64_t, + complex64, complex128, tstring, bool); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) REGISTER6(BinaryOp, GPU, "Equal", functor::equal_to, int8, int16, int64, diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc index 0b4584988ad526..95c5652548004a 100644 --- a/tensorflow/core/kernels/cwise_op_floor_div.cc +++ b/tensorflow/core/kernels/cwise_op_floor_div.cc @@ -17,8 +17,8 @@ limitations under the License. namespace tensorflow { -REGISTER8(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16, - uint32, uint64, int8, int16, int32, int64_t); +REGISTER8(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8_t, uint16_t, + uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t); REGISTER4(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float, Eigen::half, bfloat16, double); @@ -49,7 +49,7 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc index 590f6e592df7d3..9cc40508e1adce 100644 --- a/tensorflow/core/kernels/cwise_op_floor_mod.cc +++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER8(BinaryOp, CPU, "FloorMod", functor::safe_floor_mod, int8, int16, - int32, int64_t, uint8, uint16, uint32, uint64); +REGISTER8(BinaryOp, CPU, "FloorMod", functor::safe_floor_mod, int8_t, int16_t, + int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t); REGISTER4(BinaryOp, CPU, "FloorMod", functor::floor_fmod, Eigen::half, bfloat16, float, double); @@ -39,7 +39,7 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc index b05e875e19b3d9..1cd27097ce66fe 100644 --- a/tensorflow/core/kernels/cwise_op_greater.cc +++ b/tensorflow/core/kernels/cwise_op_greater.cc @@ -17,8 +17,9 @@ limitations under the License. namespace tensorflow { REGISTER9(BinaryOp, CPU, "Greater", functor::greater, float, Eigen::half, - double, int32, int64_t, uint8, uint16, uint32, uint64); -REGISTER3(BinaryOp, CPU, "Greater", functor::greater, int8, int16, bfloat16); + double, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t); +REGISTER3(BinaryOp, CPU, "Greater", functor::greater, int8_t, int16_t, + bfloat16); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) REGISTER6(BinaryOp, GPU, "Greater", functor::greater, float, Eigen::half, @@ -44,6 +45,6 @@ REGISTER_KERNEL_BUILDER(Name("Greater") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc index 7ccfc5501a6fda..1c9e7df836deb7 100644 --- a/tensorflow/core/kernels/cwise_op_greater_equal.cc +++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc @@ -17,9 +17,10 @@ limitations under the License. namespace tensorflow { REGISTER9(BinaryOp, CPU, "GreaterEqual", functor::greater_equal, float, - Eigen::half, double, int32, int64_t, uint8, uint16, uint32, uint64); -REGISTER3(BinaryOp, CPU, "GreaterEqual", functor::greater_equal, int8, int16, - bfloat16); + Eigen::half, double, int32_t, int64_t, uint8_t, uint16_t, uint32_t, + uint64_t); +REGISTER3(BinaryOp, CPU, "GreaterEqual", functor::greater_equal, int8_t, + int16_t, bfloat16); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) REGISTER9(BinaryOp, GPU, "GreaterEqual", functor::greater_equal, float, @@ -45,7 +46,7 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc index 5e5e827217ca1b..2f54bd8292b3b6 100644 --- a/tensorflow/core/kernels/cwise_op_invert.cc +++ b/tensorflow/core/kernels/cwise_op_invert.cc @@ -17,8 +17,8 @@ limitations under the License. 
namespace tensorflow { -REGISTER8(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64_t, - uint8, uint16, uint32, uint64); +REGISTER8(UnaryOp, CPU, "Invert", functor::invert, int8_t, int16_t, int32_t, + int64_t, uint8_t, uint16_t, uint32_t, uint64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_left_shift.cc b/tensorflow/core/kernels/cwise_op_left_shift.cc index 19c9f84ead760e..76632030feec8f 100644 --- a/tensorflow/core/kernels/cwise_op_left_shift.cc +++ b/tensorflow/core/kernels/cwise_op_left_shift.cc @@ -17,8 +17,8 @@ limitations under the License. namespace tensorflow { -REGISTER8(BinaryOp, CPU, "LeftShift", functor::left_shift, int8, int16, int32, - int64_t, uint8, uint16, uint32, uint64); +REGISTER8(BinaryOp, CPU, "LeftShift", functor::left_shift, int8_t, int16_t, + int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc index 9c7535fae31365..62dd9a18a5d86e 100644 --- a/tensorflow/core/kernels/cwise_op_less.cc +++ b/tensorflow/core/kernels/cwise_op_less.cc @@ -17,9 +17,9 @@ limitations under the License. namespace tensorflow { REGISTER5(BinaryOp, CPU, "Less", functor::less, float, Eigen::half, double, - bfloat16, int32); -REGISTER7(BinaryOp, CPU, "Less", functor::less, uint8, uint16, uint32, uint64, - int8, int16, int64_t); + bfloat16, int32_t); +REGISTER7(BinaryOp, CPU, "Less", functor::less, uint8_t, uint16_t, uint32_t, + uint64_t, int8_t, int16_t, int64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) @@ -45,6 +45,6 @@ REGISTER_KERNEL_BUILDER(Name("Less") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc index b6acec213cb6ff..e17272986381fb 100644 --- a/tensorflow/core/kernels/cwise_op_less_equal.cc +++ b/tensorflow/core/kernels/cwise_op_less_equal.cc @@ -17,9 +17,9 @@ limitations under the License. namespace tensorflow { REGISTER5(BinaryOp, CPU, "LessEqual", functor::less_equal, float, Eigen::half, - bfloat16, double, int32); -REGISTER7(BinaryOp, CPU, "LessEqual", functor::less_equal, int64_t, uint8, - uint16, uint32, uint64, int8, int16); + bfloat16, double, int32_t); +REGISTER7(BinaryOp, CPU, "LessEqual", functor::less_equal, int64_t, uint8_t, + uint16_t, uint32_t, uint64_t, int8_t, int16_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) @@ -45,7 +45,7 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc index 9be2a3a0fc9062..74db589e7783d2 100644 --- a/tensorflow/core/kernels/cwise_op_maximum.cc +++ b/tensorflow/core/kernels/cwise_op_maximum.cc @@ -18,8 +18,8 @@ limitations under the License. 
namespace tensorflow { REGISTER4(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half, bfloat16, double); -REGISTER8(BinaryOp, CPU, "Maximum", functor::maximum, int8, uint8, int16, - uint16, int32, uint32, int64_t, uint64); +REGISTER8(BinaryOp, CPU, "Maximum", functor::maximum, int8_t, uint8_t, int16_t, + uint16_t, int32_t, uint32_t, int64_t, uint64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) REGISTER6(BinaryOp, GPU, "Maximum", functor::maximum, float, Eigen::half, @@ -44,7 +44,7 @@ REGISTER_KERNEL_BUILDER(Name("Maximum") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc index 67d1c6a8452517..5a101acf5e47ce 100644 --- a/tensorflow/core/kernels/cwise_op_minimum.cc +++ b/tensorflow/core/kernels/cwise_op_minimum.cc @@ -18,8 +18,8 @@ limitations under the License. namespace tensorflow { REGISTER4(BinaryOp, CPU, "Minimum", functor::minimum, float, Eigen::half, bfloat16, double); -REGISTER8(BinaryOp, CPU, "Minimum", functor::minimum, int8, uint8, int16, - uint16, int32, uint32, int64_t, uint64); +REGISTER8(BinaryOp, CPU, "Minimum", functor::minimum, int8_t, uint8_t, int16_t, + uint16_t, int32_t, uint32_t, int64_t, uint64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) REGISTER6(BinaryOp, GPU, "Minimum", functor::minimum, float, Eigen::half, @@ -45,7 +45,7 @@ REGISTER_KERNEL_BUILDER(Name("Minimum") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mod.cc b/tensorflow/core/kernels/cwise_op_mod.cc index 32fd740a38ccca..51b91ceb85c2fd 100644 --- a/tensorflow/core/kernels/cwise_op_mod.cc +++ b/tensorflow/core/kernels/cwise_op_mod.cc @@ -16,9 +16,9 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER2(BinaryOp, CPU, "Mod", functor::safe_mod, int32, int64_t); +REGISTER2(BinaryOp, CPU, "Mod", functor::safe_mod, int32_t, int64_t); REGISTER2(BinaryOp, CPU, "Mod", functor::fmod, float, double); -REGISTER2(BinaryOp, CPU, "TruncateMod", functor::safe_mod, int32, int64_t); +REGISTER2(BinaryOp, CPU, "TruncateMod", functor::safe_mod, int32_t, int64_t); REGISTER2(BinaryOp, CPU, "TruncateMod", functor::fmod, float, double); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -45,13 +45,13 @@ REGISTER_KERNEL_BUILDER(Name("Mod") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); REGISTER_KERNEL_BUILDER(Name("TruncateMod") .Device(DEVICE_DEFAULT) .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc index 9af3108676258b..cc6fd91248766c 100644 --- a/tensorflow/core/kernels/cwise_op_mul_1.cc +++ b/tensorflow/core/kernels/cwise_op_mul_1.cc @@ -17,8 +17,8 @@ limitations under the License. 
namespace tensorflow { -REGISTER6(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8, - int32, bfloat16); +REGISTER6(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, + uint8_t, int32_t, bfloat16); REGISTER6(BinaryOp, CPU, "MulNoNan", functor::mul_no_nan, Eigen::half, float, double, complex64, complex128, bfloat16); @@ -53,8 +53,8 @@ REGISTER_KERNEL_BUILDER(Name("Mul") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_mul_2.cc b/tensorflow/core/kernels/cwise_op_mul_2.cc index 66ed75f469ebb5..31080a3e01bc0a 100644 --- a/tensorflow/core/kernels/cwise_op_mul_2.cc +++ b/tensorflow/core/kernels/cwise_op_mul_2.cc @@ -22,8 +22,8 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. #if !defined(__ANDROID_TYPES_SLIM__) -REGISTER8(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, uint32, uint64, - int16, int64_t, complex64, complex128); +REGISTER8(BinaryOp, CPU, "Mul", functor::mul, int8_t, uint16_t, uint32_t, + uint64_t, int16_t, int64_t, complex64, complex128); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) REGISTER8(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, uint32, uint64, diff --git a/tensorflow/core/kernels/cwise_op_neg_1.cc b/tensorflow/core/kernels/cwise_op_neg_1.cc index f3bd66c8986e5c..7f589ece2e313f 100644 --- a/tensorflow/core/kernels/cwise_op_neg_1.cc +++ b/tensorflow/core/kernels/cwise_op_neg_1.cc @@ -16,7 +16,7 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Neg", functor::neg, int8, int16, int32, int64_t); +REGISTER4(UnaryOp, CPU, "Neg", functor::neg, int8_t, int16_t, int32_t, int64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) @@ -37,6 +37,6 @@ REGISTER_KERNEL_BUILDER(Name("Neg") .Device(DEVICE_DEFAULT) .HostMemory("x") .HostMemory("y") - .TypeConstraint("T"), - UnaryOp>); + .TypeConstraint("T"), + UnaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc index 22b8ff992ce37b..6e787b88bb1694 100644 --- a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc +++ b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc @@ -17,17 +17,17 @@ limitations under the License. 
namespace tensorflow { REGISTER7(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half, - double, uint8, int8, int16, bfloat16); -REGISTER8(BinaryOp, CPU, "NotEqual", functor::not_equal_to, uint16, uint32, - uint64, qint8, qint16, quint8, quint16, qint32); + double, uint8_t, int8_t, int16_t, bfloat16); +REGISTER8(BinaryOp, CPU, "NotEqual", functor::not_equal_to, uint16_t, uint32_t, + uint64_t, qint8, qint16, quint8, quint16, qint32); REGISTER_KERNEL_BUILDER(Name("NotEqual") .Device(DEVICE_DEFAULT) .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc index 9f5a2508733ebe..537a8c4c0b8bf9 100644 --- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc +++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc @@ -22,7 +22,7 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. #if !defined(__ANDROID_TYPES_SLIM__) -REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, int32, int64_t, +REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, int32_t, int64_t, complex64, complex128, tstring, bool); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc index d052c6021faf37..ae21c4613f1bc4 100644 --- a/tensorflow/core/kernels/cwise_op_pow.cc +++ b/tensorflow/core/kernels/cwise_op_pow.cc @@ -18,7 +18,8 @@ limitations under the License. namespace tensorflow { REGISTER6(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, bfloat16, double, complex64, complex128); -REGISTER4(BinaryOp, CPU, "Pow", functor::safe_pow, int8, int16, int32, int64_t); +REGISTER4(BinaryOp, CPU, "Pow", functor::safe_pow, int8_t, int16_t, int32_t, + int64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_right_shift.cc b/tensorflow/core/kernels/cwise_op_right_shift.cc index 7386c3674ba9c0..cc960b023390a1 100644 --- a/tensorflow/core/kernels/cwise_op_right_shift.cc +++ b/tensorflow/core/kernels/cwise_op_right_shift.cc @@ -17,8 +17,8 @@ limitations under the License. namespace tensorflow { -REGISTER8(BinaryOp, CPU, "RightShift", functor::right_shift, int8, int16, int32, - int64_t, uint8, uint16, uint32, uint64); +REGISTER8(BinaryOp, CPU, "RightShift", functor::right_shift, int8_t, int16_t, + int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc index fa3289f9e57f32..bab42c5b58f5cc 100644 --- a/tensorflow/core/kernels/cwise_op_round.cc +++ b/tensorflow/core/kernels/cwise_op_round.cc @@ -18,7 +18,7 @@ limitations under the License. 
namespace tensorflow { REGISTER6(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double, - bfloat16, int32, int64_t); + bfloat16, int32_t, int64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc index bf572572eace3d..5ef7a4008c8728 100644 --- a/tensorflow/core/kernels/cwise_op_select.cc +++ b/tensorflow/core/kernels/cwise_op_select.cc @@ -289,7 +289,7 @@ REGISTER_SELECT_GPU(bool); REGISTER_SELECT_GPU(Eigen::half); REGISTER_SELECT_GPU(float); REGISTER_SELECT_GPU(double); -REGISTER_SELECT_GPU(int32); +REGISTER_SELECT_GPU(int32_t); REGISTER_SELECT_GPU(int64_t); REGISTER_SELECT_GPU(complex64); REGISTER_SELECT_GPU(complex128); diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc index 895280a22ab890..632e4a8cce12d5 100644 --- a/tensorflow/core/kernels/cwise_op_sign.cc +++ b/tensorflow/core/kernels/cwise_op_sign.cc @@ -18,7 +18,8 @@ limitations under the License. namespace tensorflow { REGISTER6(UnaryOp, CPU, "Sign", functor::sign, float, double, Eigen::half, bfloat16, complex64, complex128); -REGISTER4(UnaryOp, CPU, "Sign", functor::sign, int8, int16, int32, int64_t); +REGISTER4(UnaryOp, CPU, "Sign", functor::sign, int8_t, int16_t, int32_t, + int64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) REGISTER6(UnaryOp, GPU, "Sign", functor::sign, float, Eigen::half, double, @@ -41,7 +42,7 @@ REGISTER_KERNEL_BUILDER(Name("Sign") .Device(DEVICE_DEFAULT) .HostMemory("x") .HostMemory("y") - .TypeConstraint("T"), - UnaryOp>); + .TypeConstraint("T"), + UnaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc index e8122ba19a2632..ddca86ae25b7c9 100644 --- a/tensorflow/core/kernels/cwise_op_square.cc +++ b/tensorflow/core/kernels/cwise_op_square.cc @@ -18,9 +18,9 @@ limitations under the License. namespace tensorflow { REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double, - int32, int64_t, complex64, complex128); -REGISTER7(UnaryOp, CPU, "Square", functor::square, bfloat16, int8, int16, uint8, - uint16, uint32, uint64); + int32_t, int64_t, complex64, complex128); +REGISTER7(UnaryOp, CPU, "Square", functor::square, bfloat16, int8_t, int16_t, + uint8_t, uint16_t, uint32_t, uint64_t); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) @@ -45,7 +45,7 @@ REGISTER_KERNEL_BUILDER(Name("Square") .Device(DEVICE_DEFAULT) .HostMemory("x") .HostMemory("y") - .TypeConstraint("T"), - UnaryOp>); + .TypeConstraint("T"), + UnaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_squared_difference.cc b/tensorflow/core/kernels/cwise_op_squared_difference.cc index 2a34dd2c5290aa..c6f3fe2567afea 100644 --- a/tensorflow/core/kernels/cwise_op_squared_difference.cc +++ b/tensorflow/core/kernels/cwise_op_squared_difference.cc @@ -17,7 +17,7 @@ limitations under the License. 
namespace tensorflow { REGISTER8(BinaryOp, CPU, "SquaredDifference", functor::squared_difference, - float, Eigen::half, double, bfloat16, int32, int64_t, complex64, + float, Eigen::half, double, bfloat16, int32_t, int64_t, complex64, complex128); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED) @@ -37,8 +37,8 @@ REGISTER_KERNEL_BUILDER( .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); REGISTER_KERNEL_BUILDER( Name("SquaredDifference") @@ -46,7 +46,7 @@ REGISTER_KERNEL_BUILDER( .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc index db8c81db3cf460..b4eb0447115d22 100644 --- a/tensorflow/core/kernels/cwise_op_sub.cc +++ b/tensorflow/core/kernels/cwise_op_sub.cc @@ -16,12 +16,12 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER8(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double, int32, - int64_t, bfloat16, complex64, complex128); +REGISTER8(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double, + int32_t, int64_t, bfloat16, complex64, complex128); #if !defined(__ANDROID_TYPES_SLIM__) // Sub op for int8, uint8, int16, uint16 -REGISTER6(BinaryOp, CPU, "Sub", functor::sub, int8, uint8, int16, uint16, - uint32, uint64); +REGISTER6(BinaryOp, CPU, "Sub", functor::sub, int8_t, uint8_t, int16_t, + uint16_t, uint32_t, uint64_t); #else // We only register the first type when we have multi-argument calls in the // case where we're trying to reduce executable size, but it turns out that the @@ -53,7 +53,7 @@ REGISTER_KERNEL_BUILDER(Name("Sub") .HostMemory("x") .HostMemory("y") .HostMemory("z") - .TypeConstraint("T"), - BinaryOp>); + .TypeConstraint("T"), + BinaryOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc index 733fa8af2f9cae..a202e3717938e9 100644 --- a/tensorflow/core/kernels/cwise_ops_common.cc +++ b/tensorflow/core/kernels/cwise_ops_common.cc @@ -36,7 +36,7 @@ void BinaryOpShared::SetComputeError(OpKernelContext* ctx) { // associated information. This is sufficient for now, since the only binary // ops that have compute errors are integer division and mod, and the only // error they produce is zero division. 
- const string& op = ctx->op_kernel().type_string(); + const std::string& op = ctx->op_kernel().type_string(); if ((op == "Div" || op == "Mod" || op == "FloorMod" || op == "FloorDiv") && DataTypeIsInteger(ctx->op_kernel().input_type(0))) { ctx->CtxFailure(errors::InvalidArgument("Integer division by zero")); @@ -62,7 +62,7 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx) TryGetNodeAttr(ctx->op_kernel().def(), "incompatible_shape_error", &(incompatible_shape_error)); if (has_attr && !incompatible_shape_error) { - const string& op = ctx->op_kernel().type_string(); + const std::string& op = ctx->op_kernel().type_string(); OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out)); result = (op == "NotEqual"); return; diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc index 41b851fda99b4b..5c6663b666aea3 100644 --- a/tensorflow/core/kernels/cwise_ops_test.cc +++ b/tensorflow/core/kernels/cwise_ops_test.cc @@ -27,7 +27,7 @@ namespace { // Creates a Graph which applies a unary "func" on a 3D tensor of // type T with "num" elements. template -static Graph* Unary(const string& func, int num, DataType dtype) { +static Graph* Unary(const std::string& func, int num, DataType dtype) { Graph* g = new Graph(OpRegistry::Global()); Tensor data(dtype, TensorShape({64, 64, num / (64 * 64)})); CHECK_GT(data.NumElements(), 0); @@ -97,7 +97,7 @@ BM_UNARY(gpu, Round, float, DT_FLOAT); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // data func scalar. -Graph* BinaryScalar(int num, const string& func) { +Graph* BinaryScalar(int num, const std::string& func) { Graph* g = new Graph(OpRegistry::Global()); Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)})); lhs.flat().setRandom(); diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc index 62122f5d50987f..1813f7e9e02005 100644 --- a/tensorflow/core/kernels/data/batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/batch_dataset_op.cc @@ -73,7 +73,7 @@ class BatchDatasetOp::Dataset : public DatasetBase { op_version_(op_version), traceme_metadata_( {{"batch_size", - strings::Printf("%lld", static_cast(batch_size))}, + absl::StrFormat("%lld", static_cast(batch_size))}, {"drop_remainder", drop_remainder ? "true" : "false"}, {"parallel_copy", parallel_copy ? 
"true" : "false"}}) { input_->Ref(); @@ -106,7 +106,7 @@ class BatchDatasetOp::Dataset : public DatasetBase { ~Dataset() override { input_->Unref(); } std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { + const std::string& prefix) const override { name_utils::IteratorPrefixParams params; params.op_version = op_version_; return std::make_unique(Iterator::Params{ @@ -121,7 +121,7 @@ class BatchDatasetOp::Dataset : public DatasetBase { return output_shapes_; } - string DebugString() const override { + std::string DebugString() const override { name_utils::DatasetDebugStringParams params; params.op_version = op_version_; params.set_args(batch_size_); @@ -146,9 +146,9 @@ class BatchDatasetOp::Dataset : public DatasetBase { return input_->CheckExternalState(); } - absl::Status Get(OpKernelContext* ctx, int64 index, + absl::Status Get(OpKernelContext* ctx, int64_t index, std::vector* out_tensors) const override { - const int64 cardinality = Cardinality(); + const int64_t cardinality = Cardinality(); if (index < 0 || index >= cardinality) { return errors::OutOfRange("Index out of range [0, ", cardinality, "):", index); diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc index 1cc826e8c17b3d..ad5ba2464ce9c3 100644 --- a/tensorflow/core/kernels/data/cache_dataset_ops.cc +++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc @@ -86,7 +86,7 @@ class DatasetRandomAccessCache { // Extends the temporary cache up to a given index and then updates // out_tensors with the element at that index. - absl::Status Get(OpKernelContext* ctx, int64 index, + absl::Status Get(OpKernelContext* ctx, int64_t index, std::vector* out_tensors) { if (!iter_resource_) { TF_ASSIGN_OR_RETURN(iter_resource_, @@ -104,7 +104,7 @@ class DatasetRandomAccessCache { std::vector> GetCacheData() { return cache_; } private: - absl::Status ExtendTempCacheToIndex(int64 index, OpKernelContext* ctx) { + absl::Status ExtendTempCacheToIndex(int64_t index, OpKernelContext* ctx) { bool end_of_sequence; while (cache_.size() <= index) { std::vector out_tensors; @@ -169,7 +169,7 @@ class IteratorRandomAccessCache { class CacheDatasetOp::FileDatasetBase : public DatasetBase { public: FileDatasetBase(OpKernelContext* ctx, const DatasetBase* input, - string filename, Env* env) + std::string filename, Env* env) : DatasetBase(DatasetContext(ctx)), input_(input), filename_(std::move(filename)), @@ -184,7 +184,7 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase { ~FileDatasetBase() override { input_->Unref(); } std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { + const std::string& prefix) const override { name_utils::IteratorPrefixParams params; params.dataset_prefix = kFileDatasetPrefix; return std::make_unique(FileIterator::Params{ @@ -199,7 +199,7 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase { return input_->output_shapes(); } - string DebugString() const override { + std::string DebugString() const override { name_utils::DatasetDebugStringParams params; params.dataset_prefix = kFileDatasetPrefix; return name_utils::DatasetDebugString(kDatasetType, params); @@ -225,7 +225,7 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase { private: static size_t StringPaddingSize(size_t num_tensors) { - return strings::Printf(kPaddingSizeStrFormat, num_tensors - 1).size(); + return absl::StrFormat(kPaddingSizeStrFormat, num_tensors - 1).size(); } std::string FormatName(size_t item_index, size_t 
tensor_index) const { @@ -328,14 +328,14 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase { ~FileWriterIterator() override { if (!dataset()->env_->FileExists(MetaFilename(filename_)).ok()) { LOG(WARNING) << kIncompleteCacheErrorMessage; - std::vector cache_files; + std::vector cache_files; absl::Status s = dataset()->env_->GetMatchingPaths( absl::StrCat(filename_, "*"), &cache_files); if (!s.ok()) { LOG(WARNING) << "Failed to get matching files on " << filename_ << "* : " << s.ToString(); } - for (const string& path : cache_files) { + for (const std::string& path : cache_files) { s = dataset()->env_->DeleteFile(path); if (!s.ok()) { LOG(WARNING) << "Failed to delete " << path << " : " @@ -387,7 +387,7 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase { size_t tensor_index = 0; for (const Tensor& t : *out_tensors) { DCHECK_LT(tensor_index, dataset()->num_tensors_); - string key = dataset()->FormatName(cur_index_, tensor_index++); + std::string key = dataset()->FormatName(cur_index_, tensor_index++); TF_RETURN_IF_ERROR(writer_->Add(key, t)); } if (*end_of_sequence) { @@ -576,9 +576,9 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase { std::unique_ptr input_impl_ TF_GUARDED_BY(mu_); // The current prefix for the cache file. This is equal to // `StrCat(dataset()->filename_, "_", shard_id_)`. - string filename_; + std::string filename_; std::unique_ptr writer_ TF_GUARDED_BY(mu_); - string lockfile_ TF_GUARDED_BY(mu_); + std::string lockfile_ TF_GUARDED_BY(mu_); bool lockfile_created_ TF_GUARDED_BY(mu_); bool iteration_completed_ TF_GUARDED_BY(mu_); }; // FileWriterIterator @@ -730,7 +730,7 @@ class CacheDatasetOp::FileDataset : public CacheDatasetOp::FileDatasetBase { class CacheDatasetOp::FileDatasetV2 : public CacheDatasetOp::FileDatasetBase { public: explicit FileDatasetV2(OpKernelContext* ctx, const DatasetBase* input, - string filename, Env* env, + std::string filename, Env* env, const Tensor& resource_handle) : FileDatasetBase(ctx, input, filename, env), resource_handle_(resource_handle) {} @@ -768,7 +768,7 @@ class CacheDatasetOp::MemoryDatasetBase : public DatasetBase { ~MemoryDatasetBase() override { input_->Unref(); } std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { + const std::string& prefix) const override { name_utils::IteratorPrefixParams params; params.dataset_prefix = kMemoryDatasetPrefix; return std::make_unique( @@ -785,7 +785,7 @@ class CacheDatasetOp::MemoryDatasetBase : public DatasetBase { return input_->output_shapes(); } - string DebugString() const override { + std::string DebugString() const override { name_utils::DatasetDebugStringParams params; params.dataset_prefix = kMemoryDatasetPrefix; return name_utils::DatasetDebugString(kDatasetType, params); @@ -795,7 +795,7 @@ class CacheDatasetOp::MemoryDatasetBase : public DatasetBase { return input_->Cardinality(options); }; - absl::Status Get(OpKernelContext* ctx, int64 index, + absl::Status Get(OpKernelContext* ctx, int64_t index, std::vector* out_tensors) const override { mutex_lock l(mu_); @@ -815,7 +815,7 @@ class CacheDatasetOp::MemoryDatasetBase : public DatasetBase { return dataset_random_access_cache_->Get(ctx, index, out_tensors); } - absl::Status Get(AnyContext ctx, int64 index, + absl::Status Get(AnyContext ctx, int64_t index, std::vector* out_tensors) const override { mutex_lock l(mu_); if (!iterator_random_access_cache_) { @@ -1182,7 +1182,7 @@ void CacheDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input, OP_REQUIRES_OK(ctx, 
ParseScalarArgument(ctx, kFileName, &filename)); if (filename.empty()) { static std::atomic resource_id_counter(0); - const string& container = ctx->resource_manager()->default_container(); + const std::string& container = ctx->resource_manager()->default_container(); auto name = strings::StrCat(ctx->op_kernel().name(), "/", kMemoryCache, "_", resource_id_counter.fetch_add(1)); if (op_version_ == 2) { diff --git a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc index 2ccf09149c4c34..ec4067c15110c3 100644 --- a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc +++ b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc @@ -32,10 +32,10 @@ constexpr char kMemoryDatasetPrefix[] = "Memory"; class CacheDatasetParams : public DatasetParams { public: template - CacheDatasetParams(T input_dataset_params, string filename, + CacheDatasetParams(T input_dataset_params, std::string filename, DataTypeVector output_dtypes, std::vector output_shapes, - string node_name) + std::string node_name) : DatasetParams(std::move(output_dtypes), std::move(output_shapes), std::move(node_name)), filename_(filename) { @@ -51,7 +51,8 @@ class CacheDatasetParams : public DatasetParams { return {filename_tensor}; } - absl::Status GetInputNames(std::vector* input_names) const override { + absl::Status GetInputNames( + std::vector* input_names) const override { *input_names = {CacheDatasetOp::kInputDataset, CacheDatasetOp::kFileName}; return absl::OkStatus(); } @@ -63,12 +64,14 @@ class CacheDatasetParams : public DatasetParams { return absl::OkStatus(); } - string dataset_type() const override { return CacheDatasetOp::kDatasetType; } + std::string dataset_type() const override { + return CacheDatasetOp::kDatasetType; + } - string filename() const { return filename_; } + std::string filename() const { return filename_; } private: - string filename_; + std::string filename_; }; class CacheDatasetOpTest : public DatasetOpsTestBase { @@ -82,14 +85,14 @@ class CacheDatasetOpTest : public DatasetOpsTestBase { ~CacheDatasetOpTest() override { if (!cache_filename_.empty()) { - std::vector cache_files; + std::vector cache_files; absl::Status s = device_->env()->GetMatchingPaths( absl::StrCat(cache_filename_, "*"), &cache_files); if (!s.ok()) { LOG(WARNING) << "Failed to get matching files on " << cache_filename_ << "* : " << s; } - for (const string& path : cache_files) { + for (const std::string& path : cache_files) { s = device_->env()->DeleteFile(path); if (!s.ok()) { LOG(WARNING) << "Failed to delete " << path << " : " << s; diff --git a/tensorflow/core/kernels/data/cache_ops.cc b/tensorflow/core/kernels/data/cache_ops.cc index 0dce7f73215f92..0338ca1b3fcfc8 100644 --- a/tensorflow/core/kernels/data/cache_ops.cc +++ b/tensorflow/core/kernels/data/cache_ops.cc @@ -80,7 +80,7 @@ AnonymousMemoryCacheHandleOp::AnonymousMemoryCacheHandleOp( /* ref_counting */ true, /* return_deleter */ true) {} -string AnonymousMemoryCacheHandleOp::name() { return kMemoryCache; } +std::string AnonymousMemoryCacheHandleOp::name() { return kMemoryCache; } absl::Status AnonymousMemoryCacheHandleOp::CreateResource( OpKernelContext* ctx, std::unique_ptr flib_def, diff --git a/tensorflow/core/kernels/data/cache_ops.h b/tensorflow/core/kernels/data/cache_ops.h index e1e58ae9c1df89..f91f261ea79bec 100644 --- a/tensorflow/core/kernels/data/cache_ops.h +++ b/tensorflow/core/kernels/data/cache_ops.h @@ -62,7 +62,7 @@ class MemoryCacheManager : public ResourceBase { public: 
MemoryCacheManager() : cache_(std::make_shared()) {} - string DebugString() const override; + std::string DebugString() const override; std::shared_ptr get() { return cache_; } @@ -77,7 +77,7 @@ class AnonymousMemoryCacheHandleOp explicit AnonymousMemoryCacheHandleOp(OpKernelConstruction* ctx); private: - string name() override; + std::string name() override; absl::Status CreateResource( OpKernelContext* ctx, std::unique_ptr flib_def, std::unique_ptr pflr, diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc index 6d4bfc88504a7e..d9fed39b07ba88 100644 --- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc +++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc @@ -105,7 +105,7 @@ class ConcatenateDatasetOp::Dataset : public DatasetBase { } std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { + const std::string& prefix) const override { return std::make_unique(Iterator::Params{ this, name_utils::IteratorPrefix(kDatasetType, prefix)}); } @@ -124,7 +124,7 @@ class ConcatenateDatasetOp::Dataset : public DatasetBase { return output_shapes_; } - string DebugString() const override { + std::string DebugString() const override { return name_utils::DatasetDebugString(kDatasetType); } @@ -155,7 +155,7 @@ class ConcatenateDatasetOp::Dataset : public DatasetBase { return to_concatenate_->CheckExternalState(); } - absl::Status Get(OpKernelContext* ctx, int64 index, + absl::Status Get(OpKernelContext* ctx, int64_t index, std::vector* out_tensors) const override { TF_RETURN_IF_ERROR(CheckRandomAccessCompatible(index)); if (index < input_cardinality_) { diff --git a/tensorflow/core/kernels/data/dataset_ops.cc b/tensorflow/core/kernels/data/dataset_ops.cc index b3c114ce833a08..cafd1d4880b379 100644 --- a/tensorflow/core/kernels/data/dataset_ops.cc +++ b/tensorflow/core/kernels/data/dataset_ops.cc @@ -120,7 +120,7 @@ void DatasetToGraphOp::Compute(OpKernelContext* ctx) { DatasetCardinalityOp::DatasetCardinalityOp(OpKernelConstruction* ctx) : OpKernel(ctx), cardinality_options_(new CardinalityOptions) { if (ctx->HasAttr(kCardinalityOptions)) { - string options_serialized; + std::string options_serialized; OP_REQUIRES_OK(ctx, ctx->GetAttr(kCardinalityOptions, &options_serialized)); if (!options_serialized.empty()) cardinality_options_->ParseFromString(options_serialized); @@ -141,7 +141,7 @@ void DatasetFromGraphOp::Compute(OpKernelContext* ctx) { GraphDef graph_def; OP_REQUIRES(ctx, graph_def.ParseFromString(graph_def_string), errors::InvalidArgument("Could not parse GraphDef")); - string output_node; + std::string output_node; for (const auto& node : graph_def.node()) { if (node.op() == FunctionLibraryDefinition::kRetOp) { output_node = node.input(0); diff --git a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc index 88e7f1528d4c83..3163f4e62c320a 100644 --- a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc @@ -112,9 +112,9 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { output_shapes_(output_shapes), traceme_metadata_( {{"block_length", - strings::Printf("%lld", static_cast(block_length))}, + absl::StrFormat("%lld", static_cast(block_length))}, {"cycle_length", - strings::Printf("%lld", static_cast(cycle_length))}, + absl::StrFormat("%lld", 
static_cast(cycle_length))}, {"deterministic", deterministic.IsDeterministic() || deterministic.IsDefault() ? "true" @@ -126,7 +126,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { ~Dataset() override { input_->Unref(); } std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { + const std::string& prefix) const override { name_utils::IteratorPrefixParams params; params.op_version = op_version_; bool deterministic = @@ -143,7 +143,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { return output_shapes_; } - string DebugString() const override { + std::string DebugString() const override { name_utils::DatasetDebugStringParams params; params.op_version = op_version_; return name_utils::DatasetDebugString(kDatasetType, params); @@ -949,7 +949,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { absl::Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) { - string iterator_name = + std::string iterator_name = strings::StrCat(prefix(), "::", kWorker, "_", index); TF_RETURN_IF_ERROR(writer->WriteScalar(iterator_name, kInputSize, workers_[index].input.size())); @@ -975,7 +975,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { absl::Status ReadWorkerStateLocked(IteratorContext* ctx, IteratorStateReader* reader, int index) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) { - string worker_prefix = + std::string worker_prefix = strings::StrCat(prefix(), "::", kWorker, "_", index); // Restore inputs. int64_t input_size; @@ -1009,7 +1009,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { IteratorStateWriter* writer, int index) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) { - string iterator_name = + std::string iterator_name = strings::StrCat(prefix(), "::", kWorkerThread, "_", index); if (worker_thread_states_[index].iterator != nullptr) { TF_RETURN_IF_ERROR( @@ -1043,7 +1043,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { IteratorStateReader* reader, int index, WorkerThreadState* state) { - string worker_prefix = + std::string worker_prefix = strings::StrCat(prefix(), "::", kWorkerThread, "_", index); // Restore inputs. 
int64_t input_size; @@ -1083,8 +1083,8 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { absl::Status WriteOutputElemLocked(IteratorStateWriter* writer, const OutputElem& output_elem, - const string& iterator_name, - const string& prefix) + const std::string& iterator_name, + const std::string& prefix) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) { TF_RETURN_IF_ERROR(WriteStatusLocked(writer, iterator_name, absl::StrCat(prefix, "_", kStatus), @@ -1103,8 +1103,8 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { absl::Status ReadOutputElemLocked(IteratorContext* ctx, IteratorStateReader* reader, OutputElem* output_elem, - const string& iterator_name, - const string& prefix) { + const std::string& iterator_name, + const std::string& prefix) { TF_RETURN_IF_ERROR(ReadStatusLocked(reader, iterator_name, absl::StrCat(prefix, "_", kStatus), &output_elem->status)); @@ -1123,8 +1123,8 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { } absl::Status WriteStatusLocked(IteratorStateWriter* writer, - const string& iterator_name, - const string& prefix, + const std::string& iterator_name, + const std::string& prefix, const absl::Status& status) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) { TF_RETURN_IF_ERROR( @@ -1139,8 +1139,9 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { } absl::Status ReadStatusLocked(IteratorStateReader* reader, - const string& iterator_name, - const string& prefix, absl::Status* status) { + const std::string& iterator_name, + const std::string& prefix, + absl::Status* status) { int64_t code_int; TF_RETURN_IF_ERROR(reader->ReadScalar( iterator_name, absl::StrCat(prefix, "_", kCode), &code_int)); diff --git a/tensorflow/core/kernels/data/experimental/random_dataset_op_test.cc b/tensorflow/core/kernels/data/experimental/random_dataset_op_test.cc index a3e38ce4aeab90..f5d94b30bbd7ba 100644 --- a/tensorflow/core/kernels/data/experimental/random_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/experimental/random_dataset_op_test.cc @@ -80,7 +80,7 @@ class RandomDatasetParams : public DatasetParams { bool rerandomize_each_iteration, DataTypeVector output_dtypes, std::vector output_shapes, - string node_name) + std::string node_name) : DatasetParams(std::move(output_dtypes), std::move(output_shapes), std::move(node_name)), seed_(CreateTensor(TensorShape({}), {seed})), @@ -98,7 +98,8 @@ class RandomDatasetParams : public DatasetParams { return {seed_, seed2_, seed_generator_resource_}; } - absl::Status GetInputNames(std::vector* input_names) const override { + absl::Status GetInputNames( + std::vector* input_names) const override { *input_names = {RandomDatasetOp::kSeed, RandomDatasetOp::kSeed2}; if (op_version_ == 2) { input_names->emplace_back("seed_generator"); @@ -117,7 +118,9 @@ class RandomDatasetParams : public DatasetParams { return absl::OkStatus(); } - string dataset_type() const override { return RandomDatasetOp::kDatasetType; } + std::string dataset_type() const override { + return RandomDatasetOp::kDatasetType; + } private: Tensor seed_; diff --git a/tensorflow/core/kernels/data/experimental/save_dataset_op_test.cc b/tensorflow/core/kernels/data/experimental/save_dataset_op_test.cc index fe2315e35bd6a4..01f96cb04ed82e 100644 --- a/tensorflow/core/kernels/data/experimental/save_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/experimental/save_dataset_op_test.cc @@ -38,7 +38,7 @@ class SaveDatasetV2Params : public DatasetParams { std::vector func_lib, bool use_shard_func, DataTypeVector 
output_dtypes, std::vector output_shapes, - string node_name, DataTypeVector type_arguments) + std::string node_name, DataTypeVector type_arguments) : DatasetParams(std::move(output_dtypes), std::move(output_shapes), std::move(node_name)), path_(path), @@ -59,7 +59,8 @@ class SaveDatasetV2Params : public DatasetParams { return input_tensors; } - absl::Status GetInputNames(std::vector* input_names) const override { + absl::Status GetInputNames( + std::vector* input_names) const override { input_names->clear(); input_names->emplace_back(SaveDatasetV2Op::kInputDataset); input_names->emplace_back(SaveDatasetV2Op::kPath); @@ -78,11 +79,13 @@ class SaveDatasetV2Params : public DatasetParams { return absl::OkStatus(); } - string path() const { return path_; } + std::string path() const { return path_; } - string dataset_type() const override { return SaveDatasetV2Op::kDatasetType; } + std::string dataset_type() const override { + return SaveDatasetV2Op::kDatasetType; + } - string op_name() const override { return "SaveDatasetV2"; } + std::string op_name() const override { return "SaveDatasetV2"; } std::vector func_lib() const override { return func_lib_; } diff --git a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc index b765f96d60e71c..ff15bd00f4e1c6 100644 --- a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc @@ -60,7 +60,7 @@ class SleepDatasetOp : public UnaryDatasetOpKernel { ~Dataset() override { input_->Unref(); } std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { + const std::string& prefix) const override { return std::make_unique( Iterator::Params{this, absl::StrCat(prefix, "::Sleep")}); } @@ -72,7 +72,9 @@ class SleepDatasetOp : public UnaryDatasetOpKernel { return input_->output_shapes(); } - string DebugString() const override { return "SleepDatasetOp::Dataset"; } + std::string DebugString() const override { + return "SleepDatasetOp::Dataset"; + } int64_t CardinalityInternal(CardinalityOptions options) const override { return input_->Cardinality(options); diff --git a/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc index cab138c9903c42..3ab56ba9af36bd 100644 --- a/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc @@ -72,7 +72,7 @@ class SqlDatasetOp : public DatasetOpKernel { // TODO(b/64276826) Change this check when we add support for other // databases. OP_REQUIRES(ctx, driver_name == "sqlite", - errors::InvalidArgument(tensorflow::strings::Printf( + errors::InvalidArgument(absl::StrFormat( "The database type, %s, is not supported by SqlDataset. 
" "The set of supported databases is: {'sqlite'}.", driver_name.c_str()))); diff --git a/tensorflow/core/kernels/data/finalize_dataset_op_test.cc b/tensorflow/core/kernels/data/finalize_dataset_op_test.cc index 2077cc28c161ec..c076e2dcc4dc77 100644 --- a/tensorflow/core/kernels/data/finalize_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/finalize_dataset_op_test.cc @@ -31,7 +31,7 @@ class FinalizeDatasetParams : public DatasetParams { template FinalizeDatasetParams(T input_dataset_params, DataTypeVector output_dtypes, std::vector output_shapes, - string node_name) + std::string node_name) : DatasetParams(std::move(output_dtypes), std::move(output_shapes), std::move(node_name)), has_captured_ref_(false) { @@ -40,7 +40,8 @@ class FinalizeDatasetParams : public DatasetParams { std::vector GetInputTensors() const override { return {}; } - absl::Status GetInputNames(std::vector* input_names) const override { + absl::Status GetInputNames( + std::vector* input_names) const override { input_names->emplace_back(FinalizeDatasetOp::kInputDataset); return absl::OkStatus(); } @@ -52,7 +53,7 @@ class FinalizeDatasetParams : public DatasetParams { return absl::OkStatus(); } - string dataset_type() const override { return "Finalize"; } + std::string dataset_type() const override { return "Finalize"; } private: bool has_captured_ref_; diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index d10513763fa726..a4a3bb4c77afeb 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -583,7 +583,7 @@ AnonymousIteratorHandleOp::AnonymousIteratorHandleOp( OP_REQUIRES_OK(context, context->GetAttr(kOutputShapes, &output_shapes_)); } -string AnonymousIteratorHandleOp::name() { return kAnonymousIterator; } +std::string AnonymousIteratorHandleOp::name() { return kAnonymousIterator; } absl::Status AnonymousIteratorHandleOp::CreateResource( OpKernelContext* ctx, std::unique_ptr flib_def, @@ -725,7 +725,7 @@ class OneShotIteratorOp : public AsyncOpKernel { graph_def_version_(ctx->graph_def_version()) { - string shared_name; + std::string shared_name; OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &shared_name)); OP_REQUIRES(ctx, shared_name.empty(), errors::InvalidArgument("OneShotIteratorOp does not currently " @@ -837,9 +837,10 @@ class OneShotIteratorOp : public AsyncOpKernel { &f_handle)); FunctionLibraryRuntime::Options opts; opts.cancellation_manager = ctx->cancellation_manager(); - ScopedStepContainer step_container(opts.step_id, [ctx](const string& name) { - ctx->resource_manager()->Cleanup(name).IgnoreError(); - }); + ScopedStepContainer step_container( + opts.step_id, [ctx](const std::string& name) { + ctx->resource_manager()->Cleanup(name).IgnoreError(); + }); opts.step_container = &step_container; opts.runner = ctx->runner(); opts.run_all_kernels_inline = ctx->run_all_kernels_inline(); diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc index b381f28def6ea4..e5ef9d1451cd69 100644 --- a/tensorflow/core/kernels/data/zip_dataset_op.cc +++ b/tensorflow/core/kernels/data/zip_dataset_op.cc @@ -84,7 +84,7 @@ class ZipDatasetOp::Dataset : public DatasetBase { } std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { + const std::string& prefix) const override { return std::make_unique(Iterator::Params{ this, name_utils::IteratorPrefix(kDatasetType, prefix)}); } @@ -103,7 +103,7 @@ class ZipDatasetOp::Dataset : public 
DatasetBase { return output_shapes_; } - string DebugString() const override { + std::string DebugString() const override { return name_utils::DatasetDebugString(kDatasetType); } @@ -137,7 +137,7 @@ class ZipDatasetOp::Dataset : public DatasetBase { return absl::OkStatus(); } - absl::Status Get(OpKernelContext* ctx, int64 index, + absl::Status Get(OpKernelContext* ctx, int64_t index, std::vector* out_tensors) const override { TF_RETURN_IF_ERROR(CheckRandomAccessCompatible(index)); out_tensors->reserve(output_dtypes().size()); diff --git a/tensorflow/core/kernels/data/zip_dataset_op_test.cc b/tensorflow/core/kernels/data/zip_dataset_op_test.cc index ce62706e224c2f..bfde48fb9509d8 100644 --- a/tensorflow/core/kernels/data/zip_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/zip_dataset_op_test.cc @@ -42,7 +42,7 @@ class ZipDatasetParams : public DatasetParams { ZipDatasetParams(std::vector input_dataset_params, DataTypeVector output_dtypes, std::vector output_shapes, - int num_input_datasets, string node_name) + int num_input_datasets, std::string node_name) : DatasetParams(std::move(output_dtypes), std::move(output_shapes), std::move(node_name)), num_input_datasets_(num_input_datasets) { @@ -57,7 +57,8 @@ class ZipDatasetParams : public DatasetParams { std::vector GetInputTensors() const override { return {}; } - absl::Status GetInputNames(std::vector* input_names) const override { + absl::Status GetInputNames( + std::vector* input_names) const override { input_names->clear(); for (int i = 0; i < num_input_datasets_; ++i) { input_names->emplace_back( @@ -75,10 +76,12 @@ class ZipDatasetParams : public DatasetParams { return absl::OkStatus(); } - string dataset_type() const override { return ZipDatasetOp::kDatasetType; } + std::string dataset_type() const override { + return ZipDatasetOp::kDatasetType; + } private: - int32 num_input_datasets_; + int32_t num_input_datasets_; }; class ZipDatasetOpTest : public DatasetOpsTestBase {}; diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc index 78ca7948e55c0f..db7cf3f31f7849 100644 --- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc @@ -560,7 +560,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel { errors::InvalidArgument("Sliding window strides field must " "specify 4 dimensions")); - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -619,7 +619,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel { "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", input_sizes.dims())); TensorShape input_shape; - const int32* in_sizes_data = input_sizes.template flat().data(); + const int32_t* in_sizes_data = input_sizes.template flat().data(); for (int i = 0; i < input_sizes.NumElements(); ++i) { OP_REQUIRES(context, in_sizes_data[i] >= 0, @@ -695,7 +695,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel { bool use_cudnn_grouped_conv_; private: - std::vector strides_; + std::vector strides_; Padding padding_; std::vector explicit_paddings_; TensorFormat data_format_; @@ -1071,7 +1071,7 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel { errors::InvalidArgument("Sliding window strides field must " "specify 4 dimensions")); - string data_format; + std::string data_format; 
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -1129,7 +1129,8 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel { "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ", filter_sizes.dims())); TensorShape filter_shape; - const int32* filter_sizes_data = filter_sizes.template flat().data(); + const int32_t* filter_sizes_data = + filter_sizes.template flat().data(); for (int i = 0; i < filter_sizes.NumElements(); ++i) { OP_REQUIRES(context, filter_sizes_data[i] >= 0, errors::InvalidArgument("Dimension ", i, @@ -1249,7 +1250,7 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel { bool use_cudnn_grouped_conv_; private: - std::vector strides_; + std::vector strides_; Padding padding_; std::vector explicit_paddings_; TensorFormat data_format_; diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc index bcc81f903b84f6..5743b6d6cc8cc7 100644 --- a/tensorflow/core/kernels/fft_ops.cc +++ b/tensorflow/core/kernels/fft_ops.cc @@ -599,11 +599,11 @@ class CufftScratchAllocator : public se::ScratchAllocator { CufftScratchAllocator(int64_t memory_limit, OpKernelContext* context) : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {} int64_t GetMemoryLimitInBytes() override { return memory_limit_; } - tsl::StatusOr> AllocateBytes( + absl::StatusOr> AllocateBytes( int64_t byte_size) override { Tensor temporary_memory; if (byte_size > memory_limit_) { - return tsl::StatusOr>(); + return absl::StatusOr>(); } AllocationAttributes allocation_attr; allocation_attr.retry_on_failure = false; @@ -611,13 +611,13 @@ class CufftScratchAllocator : public se::ScratchAllocator { DT_UINT8, TensorShape({byte_size}), &temporary_memory, AllocatorAttributes(), allocation_attr)); if (!allocation_status.ok()) { - return tsl::StatusOr>(); + return absl::StatusOr>(); } // Hold the reference of the allocated tensors until the end of the // allocator. 
allocated_tensors_.push_back(temporary_memory); total_byte_size_ += byte_size; - return tsl::StatusOr>( + return absl::StatusOr>( AsDeviceMemory(temporary_memory.flat().data(), temporary_memory.flat().size())); } diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc index 3d510e4b50dadd..015b4ac8fafa7a 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cc @@ -717,7 +717,8 @@ class CudnnBatchNormAllocatorInTemp : public ScratchAllocator { return std::numeric_limits::max(); } - StatusOr> AllocateBytes(int64_t byte_size) override { + absl::StatusOr> AllocateBytes( + int64_t byte_size) override { Tensor temporary_memory; const DataType tf_data_type = DataTypeToEnum::v(); int64_t allocate_count = @@ -770,7 +771,8 @@ class CudnnBatchNormAllocatorInOutput : public ScratchAllocator { return std::numeric_limits::max(); } - StatusOr> AllocateBytes(int64_t byte_size) override { + absl::StatusOr> AllocateBytes( + int64_t byte_size) override { output_allocated = true; DCHECK(total_byte_size_ == 0) << "Reserve space allocator can only be called once"; @@ -787,7 +789,7 @@ class CudnnBatchNormAllocatorInOutput : public ScratchAllocator { auto memory_uint8 = DeviceMemory::MakeFromByteSize( temporary_memory->template flat().data(), temporary_memory->template flat().size() * sizeof(T)); - return StatusOr>(memory_uint8); + return absl::StatusOr>(memory_uint8); } int64_t TotalByteSize() { return total_byte_size_; } diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc index 73fa3c4b74e296..794f51cd1cb394 100644 --- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc @@ -62,7 +62,7 @@ class FuzzExampleProtoFastParsing : public FuzzSession { // TODO(dga): Test the batch case also. Tensor input_tensor(tensorflow::DT_STRING, TensorShape({})); input_tensor.scalar()() = - string(reinterpret_cast(data), size); + std::string(reinterpret_cast(data), size); RunInputs({{"input", input_tensor}}); } }; diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h index 09c7563d2efd17..d178208a1a35e0 100644 --- a/tensorflow/core/kernels/fuzzing/fuzz_session.h +++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h @@ -81,7 +81,7 @@ class FuzzSession { // Initializes the FuzzSession. Not safe for multithreading. // Separate init function because the call to virtual BuildGraphDef // can't be put into the constructor. - Status InitIfNeeded() { + absl::Status InitIfNeeded() { if (initialized_) { return absl::OkStatus(); } @@ -96,7 +96,7 @@ class FuzzSession { GraphDef graph_def; TF_CHECK_OK(root.ToGraphDef(&graph_def)); - Status status = session_->Create(graph_def); + absl::Status status = session_->Create(graph_def); if (!status.ok()) { // This is FATAL, because this code is designed to fuzz an op // within a session. Failure to create the session means we @@ -111,20 +111,20 @@ class FuzzSession { // any returned output. // Note: We are ignoring Status from Run here since fuzzers don't need to // check it (as that will slow them down and printing/logging is useless). 
-  void RunInputs(const std::vector<std::pair<string, Tensor> >& inputs) {
+  void RunInputs(const std::vector<std::pair<std::string, Tensor> >& inputs) {
     RunInputsWithStatus(inputs).IgnoreError();
   }
 
   // Same as RunInputs but don't ignore status
-  Status RunInputsWithStatus(
-      const std::vector<std::pair<string, Tensor> >& inputs) {
+  absl::Status RunInputsWithStatus(
+      const std::vector<std::pair<std::string, Tensor> >& inputs) {
     return session_->Run(inputs, {}, {"output"}, nullptr);
   }
 
   // Dispatches to FuzzImpl; small amount of sugar to keep the code
   // of the per-op fuzzers tiny.
   int Fuzz(const uint8_t* data, size_t size) {
-    Status status = InitIfNeeded();
+    absl::Status status = InitIfNeeded();
     TF_CHECK_OK(status) << "Fuzzer graph initialization failed: " << status.message();
     // No return value from fuzzing: Success is defined as "did not
@@ -146,7 +146,7 @@ class FuzzStringInputOp : public FuzzSession {
   void FuzzImpl(const uint8_t* data, size_t size) final {
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
     input_tensor.scalar<tstring>()() =
-        string(reinterpret_cast<const char*>(data), size);
+        std::string(reinterpret_cast<const char*>(data), size);
     RunInputs({{"input", input_tensor}});
   }
 };
diff --git a/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc b/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
index 08af574ac9ae4e..458329000ca349 100644
--- a/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
@@ -42,7 +42,7 @@ class FuzzOneHot : public FuzzSession {
   void FuzzImpl(const uint8_t* data, size_t size) override {
     int64_t input_size;
     int32_t depth;
-    uint8 on, off;
+    uint8_t on, off;
    const uint8_t* input_data;
 
     if (size > 3) {
@@ -51,7 +51,7 @@
       if (size > kMaxSize) {
         size = kMaxSize;
       }
-      depth = static_cast<int32>(data[0]);
+      depth = static_cast<int32_t>(data[0]);
       on = data[1];
       off = data[2];
       input_size = static_cast<int64_t>(size - 3);
@@ -69,13 +69,13 @@
     Tensor on_tensor(tensorflow::DT_UINT8, TensorShape({}));
     Tensor off_tensor(tensorflow::DT_UINT8, TensorShape({}));
 
-    auto flat_tensor = input_tensor.flat<uint8>();
+    auto flat_tensor = input_tensor.flat<uint8_t>();
     for (size_t i = 0; i < input_size; i++) {
       flat_tensor(i) = input_data[i];
     }
-    depth_tensor.scalar<int32>()() = depth;
-    on_tensor.scalar<uint8>()() = on;
-    off_tensor.scalar<uint8>()() = off;
+    depth_tensor.scalar<int32_t>()() = depth;
+    on_tensor.scalar<uint8_t>()() = on;
+    off_tensor.scalar<uint8_t>()() = off;
 
     RunInputs({{"input", input_tensor},
                {"depth", depth_tensor},
diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
index de3ae36dc75d56..a8cc47e599ee43 100644
--- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
@@ -53,7 +53,8 @@ class FuzzParseTensor : public FuzzSession {
     // detects another similar OOM.
     // After adding `-fsanitize=null` to ASAN (cl/317376103), the memory
     // footprint increased, so we lower the maximum threshold to 2^18.
- string as_string = string(reinterpret_cast(data), size); + std::string as_string = + std::string(reinterpret_cast(data), size); TensorProto proto; if (!ParseProtoUnlimited(&proto, as_string)) { LOG(WARNING) << "Unable to parse proto of tensor\n"; diff --git a/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc b/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc index 5104711ad3048f..81f489b2080d80 100644 --- a/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc @@ -67,7 +67,7 @@ class FuzzScatterNd : public FuzzSession { // Subsequent elements give the contents of the shape tensor. // To not get out of memory, reduce all dimensions to at most kMaxDim - auto flat_shape = shape_tensor.flat(); + auto flat_shape = shape_tensor.flat(); for (i = 0; i < shape_dims; i++) { flat_shape(i) = data[data_ix++] % kMaxDim; } @@ -94,7 +94,7 @@ class FuzzScatterNd : public FuzzSession { Tensor indices_tensor(tensorflow::DT_INT32, TensorShape(indices_dims)); // Rest of the buffer is used to fill in the indices_tensor - auto flat_indices = indices_tensor.flat(); + auto flat_indices = indices_tensor.flat(); for (i = 0; i < num_indices && data_ix < size; i++) { flat_indices(i) = data[data_ix++]; } @@ -118,7 +118,7 @@ class FuzzScatterNd : public FuzzSession { Tensor updates_tensor(tensorflow::DT_INT32, TensorShape(updates_dims)); // We don't care about the values in the updates_tensor, make them all be 1 - auto flat_updates = updates_tensor.flat(); + auto flat_updates = updates_tensor.flat(); for (i = 0; i < num_indices; i++) { flat_updates(i) = 1; } diff --git a/tensorflow/core/kernels/image/non_max_suppression_op_gpu_test.cc b/tensorflow/core/kernels/image/non_max_suppression_op_gpu_test.cc index bcdc406d85201a..151a956ca22fd4 100644 --- a/tensorflow/core/kernels/image/non_max_suppression_op_gpu_test.cc +++ b/tensorflow/core/kernels/image/non_max_suppression_op_gpu_test.cc @@ -195,8 +195,7 @@ TEST_F(NonMaxSuppressionV2GPUOpTest, TestInconsistentBoxAndScoreShapes) { Status s = RunOpKernel(); ASSERT_FALSE(s.ok()); - EXPECT_TRUE( - str_util::StrContains(s.ToString(), "scores has incompatible shape")) + EXPECT_TRUE(absl::StrContains(s.ToString(), "scores has incompatible shape")) << s; } @@ -210,7 +209,7 @@ TEST_F(NonMaxSuppressionV2GPUOpTest, TestInvalidIOUThreshold) { ASSERT_FALSE(s.ok()); EXPECT_TRUE( - str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]")) + absl::StrContains(s.ToString(), "iou_threshold must be in [0, 1]")) << s; } diff --git a/tensorflow/core/kernels/immutable_constant_op.cc b/tensorflow/core/kernels/immutable_constant_op.cc index be0194413a3b81..4fbd1edfba920a 100644 --- a/tensorflow/core/kernels/immutable_constant_op.cc +++ b/tensorflow/core/kernels/immutable_constant_op.cc @@ -26,7 +26,7 @@ class MemmappedTensorAllocator : public Allocator { public: MemmappedTensorAllocator() {} - absl::Status InitializeFromRegion(const string& name, Env* env) { + absl::Status InitializeFromRegion(const std::string& name, Env* env) { const auto status = env->NewReadOnlyMemoryRegionFromFile(name, &memory_region_); if (!status.ok()) { @@ -34,7 +34,7 @@ class MemmappedTensorAllocator : public Allocator { } return absl::OkStatus(); } - string Name() override { return "MemmappedTensorAllocator"; } + std::string Name() override { return "MemmappedTensorAllocator"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override { if ((reinterpret_cast(memory_region_->data())) % alignment != 0) { diff --git 
a/tensorflow/core/kernels/immutable_constant_op.h b/tensorflow/core/kernels/immutable_constant_op.h index 264abc8401b3b4..cd645686bddcfa 100644 --- a/tensorflow/core/kernels/immutable_constant_op.h +++ b/tensorflow/core/kernels/immutable_constant_op.h @@ -38,7 +38,7 @@ class ImmutableConstantOp : public OpKernel { static constexpr char const* kMemoryRegionNameAttr = "memory_region_name"; private: - string region_name_; + std::string region_name_; DataType dtype_; TensorShape shape_; ImmutableConstantOp(const ImmutableConstantOp&) = delete; diff --git a/tensorflow/core/kernels/immutable_constant_op_test.cc b/tensorflow/core/kernels/immutable_constant_op_test.cc index 1cfed79bf3318e..955d3f8751c12a 100644 --- a/tensorflow/core/kernels/immutable_constant_op_test.cc +++ b/tensorflow/core/kernels/immutable_constant_op_test.cc @@ -40,7 +40,7 @@ constexpr size_t kTestTensorSizeBytes = kTestTensorSize * sizeof(float); class TestReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { public: TestReadOnlyMemoryRegion() = delete; - explicit TestReadOnlyMemoryRegion(uint64 length) + explicit TestReadOnlyMemoryRegion(uint64_t length) : memptr_(cpu_allocator()->AllocateRaw(kTestAlignment, length)), length_(length) {} ~TestReadOnlyMemoryRegion() override { @@ -48,11 +48,11 @@ class TestReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { } const void* data() override { return memptr_; } float* GetWritableDataStart() { return reinterpret_cast(memptr_); } - uint64 length() override { return length_; } + uint64_t length() override { return length_; } protected: void* memptr_; - uint64 length_; + uint64_t length_; }; // A mock file system and environment class that creates ReadOnlyMemoryRegion @@ -65,7 +65,7 @@ class TestFileSystem : public NullFileSystem { using NullFileSystem::NewReadOnlyMemoryRegionFromFile; absl::Status NewReadOnlyMemoryRegionFromFile( - const string& fname, TransactionToken* token, + const std::string& fname, TransactionToken* token, std::unique_ptr* result) override { float val = 0; absl::string_view scheme, host, path; @@ -146,13 +146,13 @@ TEST(ImmutableConstantOpTest, ExecutionError) { error::INTERNAL); } -absl::Status CreateTempFileFloat(Env* env, float value, uint64 size, - string* filename) { - const string dir = testing::TmpDir(); +absl::Status CreateTempFileFloat(Env* env, float value, uint64_t size, + std::string* filename) { + const std::string dir = testing::TmpDir(); *filename = io::JoinPath(dir, absl::StrCat("file_", value)); std::unique_ptr file; TF_RETURN_IF_ERROR(env->NewWritableFile(*filename, &file)); - for (uint64 i = 0; i < size; ++i) { + for (uint64_t i = 0; i < size; ++i) { absl::string_view sp(static_cast(static_cast(&value)), sizeof(value)); TF_RETURN_IF_ERROR(file->Append(sp)); @@ -166,7 +166,7 @@ TEST(ImmutableConstantOpTest, FromFile) { Env* env = Env::Default(); auto root = Scope::NewRootScope().ExitOnError(); - string two_file, three_file; + std::string two_file, three_file; TF_ASSERT_OK(CreateTempFileFloat(env, 2.0f, 1000, &two_file)); TF_ASSERT_OK(CreateTempFileFloat(env, 3.0f, 1000, &three_file)); auto node1 = ops::ImmutableConst(root, DT_FLOAT, kFileTensorShape, two_file); @@ -191,9 +191,10 @@ TEST(ImmutableConstantOpTest, FromFile) { EXPECT_EQ(outputs.front().flat()(2), 2.0f * 3.0f); } -absl::Status CreateTempFileBadString(Env* env, char value, uint64 size, - const string suffix, string* filename) { - const string dir = testing::TmpDir(); +absl::Status CreateTempFileBadString(Env* env, char value, uint64_t size, + const std::string suffix, + std::string* 
filename) { + const std::string dir = testing::TmpDir(); *filename = io::JoinPath(dir, absl::StrCat("file_", suffix)); std::unique_ptr file; TF_RETURN_IF_ERROR(env->NewWritableFile(*filename, &file)); @@ -207,7 +208,7 @@ TEST(ImmutableConstantOpTest, FromFileStringUnimplmented) { Env* env = Env::Default(); auto root = Scope::NewRootScope().ExitOnError(); - string bad_file; + std::string bad_file; TF_ASSERT_OK(CreateTempFileBadString(env, '\xe2', 128, "bad_e2", &bad_file)); auto result = ops::ImmutableConst(root, DT_STRING, kFileTensorShape, bad_file); diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc index 169d331ad24487..20e12a56e8778f 100644 --- a/tensorflow/core/kernels/in_topk_op.cc +++ b/tensorflow/core/kernels/in_topk_op.cc @@ -89,15 +89,15 @@ REGISTER_KERNEL_BUILDER(Name("InTopK") .HostMemory("predictions") .HostMemory("targets") .HostMemory("precision") - .TypeConstraint("T"), - InTopK); + .TypeConstraint("T"), + InTopK); REGISTER_KERNEL_BUILDER(Name("InTopK") .Device(DEVICE_CPU) .HostMemory("predictions") .HostMemory("targets") .HostMemory("precision") .TypeConstraint("T"), - InTopK); + InTopK); REGISTER_KERNEL_BUILDER(Name("InTopKV2") .Device(DEVICE_CPU) @@ -105,8 +105,8 @@ REGISTER_KERNEL_BUILDER(Name("InTopKV2") .HostMemory("targets") .HostMemory("k") .HostMemory("precision") - .TypeConstraint("T"), - InTopK); + .TypeConstraint("T"), + InTopK); REGISTER_KERNEL_BUILDER(Name("InTopKV2") .Device(DEVICE_CPU) .HostMemory("predictions") @@ -114,7 +114,7 @@ REGISTER_KERNEL_BUILDER(Name("InTopKV2") .HostMemory("k") .HostMemory("precision") .TypeConstraint("T"), - InTopK); + InTopK); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -129,18 +129,18 @@ namespace functor { typename TTypes::Vec output); \ extern template struct InTopKFunctor; -DECLARE_GPU_SPEC(float, int32); +DECLARE_GPU_SPEC(float, int32_t); DECLARE_GPU_SPEC(float, int64_t); #undef DECLARE_GPU_SPEC } // namespace functor REGISTER_KERNEL_BUILDER( - Name("InTopKV2").Device(DEVICE_GPU).TypeConstraint("T"), - InTopK); + Name("InTopKV2").Device(DEVICE_GPU).TypeConstraint("T"), + InTopK); REGISTER_KERNEL_BUILDER( Name("InTopKV2").Device(DEVICE_GPU).TypeConstraint("T"), - InTopK); + InTopK); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/in_topk_op.h b/tensorflow/core/kernels/in_topk_op.h index 877777642ebeb6..ad10dad72bf717 100644 --- a/tensorflow/core/kernels/in_topk_op.h +++ b/tensorflow/core/kernels/in_topk_op.h @@ -62,7 +62,7 @@ struct InTopKFunctor { int64_t k_val = k.k_value; if (k.k_tensor != nullptr) { if (k.k_tensor->dtype() == DT_INT32) { - k_val = k.k_tensor->scalar()(); + k_val = k.k_tensor->scalar()(); } else { k_val = k.k_tensor->scalar()(); } diff --git a/tensorflow/core/kernels/in_topk_op_gpu.cu.cc b/tensorflow/core/kernels/in_topk_op_gpu.cu.cc index cd1d3e88b510bf..b011a24cb1ed1e 100644 --- a/tensorflow/core/kernels/in_topk_op_gpu.cu.cc +++ b/tensorflow/core/kernels/in_topk_op_gpu.cu.cc @@ -39,7 +39,7 @@ template __global__ void ComputePredictionMaskKernel( const T* __restrict__ predictions, // dims: [ num_targets x num_classes ] const TargetT* __restrict__ targets, // dims: [ num_targets ] - int64* __restrict__ mask, // dims: [ num_targets x num_classes ] + int64_t* __restrict__ mask, // dims: [ num_targets x num_classes ] int num_targets, int num_classes) { GPU_1D_KERNEL_LOOP(i, num_targets * num_classes) { const int batch_index = i / num_classes; @@ -67,7 +67,8 @@ __global__ void ComputePredictionMaskKernel( // larger than the target, or to 
'-1' if target class in invalid of predictions // in a batch have non-finite values. struct MaskSum { - __host__ __device__ int64 operator()(const int64& a, const int64& b) const { + __host__ __device__ int64_t operator()(const int64_t& a, + const int64_t& b) const { if (a < 0 || b < 0) return -1; else @@ -77,8 +78,8 @@ struct MaskSum { namespace reduction_op_helper { template <> -struct IdentityValue { - int64 operator()() { return 0; } +struct IdentityValue { + int64_t operator()() { return 0; } }; } // namespace reduction_op_helper @@ -138,8 +139,8 @@ struct InTopKFunctor { auto in = predictions_mask.matrix(); auto out = num_larger_prediction.flat(); - ReduceImpl>( - context, (int64*)out.data(), (int64*)in.data(), in.rank(), + ReduceImpl>( + context, (int64_t*)out.data(), (int64_t*)in.data(), in.rank(), in.dimension(0), in.rank() >= 2 ? in.dimension(1) : 1, in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), Dims<1>(1), MaskSum()); @@ -152,8 +153,9 @@ struct InTopKFunctor { if (k.k_tensor->dtype() == DT_INT32) { output.device(d) = (cnt >= cnt.constant(0)) && - (cnt < k.k_tensor->flat().template cast().broadcast( - Dims<1>(num_targets))); + (cnt < + k.k_tensor->flat().template cast().broadcast( + Dims<1>(num_targets))); } else { output.device(d) = (cnt >= cnt.constant(0)) && diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc index 45db7d3b2d3f49..6948cd86c1f8b1 100644 --- a/tensorflow/core/kernels/inplace_ops.cc +++ b/tensorflow/core/kernels/inplace_ops.cc @@ -106,7 +106,7 @@ class ParallelConcatUpdate : public OpKernel { } private: - int32 loc_; + int32_t loc_; }; template @@ -199,7 +199,7 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate") .HostMemory("value") .HostMemory("update") .HostMemory("output") - .TypeConstraint("T"), + .TypeConstraint("T"), ParallelConcatUpdate); #endif @@ -251,7 +251,7 @@ namespace functor { template void DoInplaceOp(const CPUDevice& d, InplaceOpType op, const Tensor& i, const Tensor& v, Tensor* y) { - auto Ti = i.flat(); + auto Ti = i.flat(); auto Tv = v.flat_outer_dims(); auto Ty = y->flat_outer_dims(); auto nrows = Ty.dimension(0); @@ -274,7 +274,7 @@ void DoInplaceOp(const CPUDevice& d, InplaceOpType op, const Tensor& i, // String type only supports inplace update. 
void DoInplaceStringUpdateOp(const CPUDevice& d, const Tensor& i, const Tensor& v, Tensor* y) { - auto Ti = i.flat(); + auto Ti = i.flat(); auto Tv = v.flat_outer_dims(); auto Ty = y->flat_outer_dims(); auto nrows = Ty.dimension(0); @@ -398,10 +398,10 @@ class EmptyOp : public OpKernel { ctx, TensorShapeUtils::IsVector(shape.shape()), errors::InvalidArgument("shape must be a vector of int32, got shape ", shape.shape().DebugString())); - auto dims = shape.flat(); + auto dims = shape.flat(); TensorShape out_shape; OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape( - reinterpret_cast(dims.data()), + reinterpret_cast(dims.data()), dims.size(), &out_shape)); Tensor* out = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); @@ -463,7 +463,7 @@ REGISTER(uint8_t); REGISTER(int64_t); REGISTER(uint64_t); -REGISTER_EMPTY(int32, GPU); +REGISTER_EMPTY(int32_t, GPU); #undef REGISTER #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -474,7 +474,7 @@ REGISTER_KERNEL_BUILDER(Name("InplaceUpdate") .HostMemory("i") .HostMemory("v") .HostMemory("y") - .TypeConstraint("T"), + .TypeConstraint("T"), InplaceOp); REGISTER_KERNEL_BUILDER(Name("InplaceAdd") .Device(DEVICE_DEFAULT) @@ -482,7 +482,7 @@ REGISTER_KERNEL_BUILDER(Name("InplaceAdd") .HostMemory("i") .HostMemory("v") .HostMemory("y") - .TypeConstraint("T"), + .TypeConstraint("T"), InplaceOp); REGISTER_KERNEL_BUILDER(Name("InplaceSub") .Device(DEVICE_DEFAULT) @@ -490,14 +490,14 @@ REGISTER_KERNEL_BUILDER(Name("InplaceSub") .HostMemory("i") .HostMemory("v") .HostMemory("y") - .TypeConstraint("T"), + .TypeConstraint("T"), InplaceOp); REGISTER_KERNEL_BUILDER(Name("DeepCopy") .Device(DEVICE_DEFAULT) .HostMemory("x") .HostMemory("y") - .TypeConstraint("T"), + .TypeConstraint("T"), CopyOp); } // end namespace diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc index 001b6a45e35c5d..6ba369ebdb4346 100644 --- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc @@ -27,13 +27,13 @@ namespace functor { typedef Eigen::GpuDevice Device; template -__global__ void DoParallelConcatOpKernel(int nthreads, const int64 rows, - const int64 cols, int32 loc, +__global__ void DoParallelConcatOpKernel(int nthreads, const int64_t rows, + const int64_t cols, int32_t loc, const T* __restrict__ src, T* __restrict__ dst) { GPU_1D_KERNEL_LOOP(idx, nthreads) { - int64 c = idx % cols; - int64 r = (loc % rows + rows) % rows; // Guard index range. + int64_t c = idx % cols; + int64_t r = (loc % rows + rows) % rows; // Guard index range. 
T* p = dst + r * cols + c; const T* q = src + idx; *p = ldg(q); @@ -41,24 +41,24 @@ __global__ void DoParallelConcatOpKernel(int nthreads, const int64 rows, } template -Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc, - Tensor* output) { - const int64 nelem = value.NumElements(); +absl::Status DoParallelConcatUpdate(const Device& d, const Tensor& value, + int32_t loc, Tensor* output) { + const int64_t nelem = value.NumElements(); GpuLaunchConfig cfg = GetGpuLaunchConfig(nelem, d); auto Toutput = output->flat_outer_dims(); - const int64 nrows = Toutput.dimension(0); - const int64 ncols = Toutput.dimension(1); + const int64_t nrows = Toutput.dimension(0); + const int64_t ncols = Toutput.dimension(1); const T* src = value.flat().data(); T* dst = output->flat().data(); TF_CHECK_OK(GpuLaunchKernel( DoParallelConcatOpKernel, cfg.block_count, cfg.thread_per_block, 0, d.stream(), cfg.virtual_thread_count, nrows, ncols, loc, src, dst)); - return OkStatus(); + return absl::OkStatus(); } template <> -Status DoParallelConcat(const Device& d, const Tensor& value, int32 loc, - Tensor* output) { +absl::Status DoParallelConcat(const Device& d, const Tensor& value, int32_t loc, + Tensor* output) { CHECK_EQ(value.dtype(), output->dtype()); switch (value.dtype()) { #define CASE(type) \ @@ -77,18 +77,18 @@ Status DoParallelConcat(const Device& d, const Tensor& value, int32 loc, return errors::InvalidArgument("Unsupported data type: ", DataTypeString(value.dtype())); } - return OkStatus(); + return absl::OkStatus(); } template -__global__ void DoInplaceOpKernel(int nthreads, const int64 rows, - const int64 cols, const int64 n, +__global__ void DoInplaceOpKernel(int nthreads, const int64_t rows, + const int64_t cols, const int64_t n, const T* __restrict__ src, - const int32* __restrict__ rowids, + const int32_t* __restrict__ rowids, T* __restrict__ dst) { GPU_1D_KERNEL_LOOP(idx, nthreads) { - int64 r = idx / cols; - int64 c = idx % cols; + int64_t r = idx / cols; + int64_t c = idx % cols; r = (rowids[r] % rows + rows) % rows; // Guard index range. T* p = dst + r * cols + c; const T* q = src + idx; @@ -109,15 +109,15 @@ __global__ void DoInplaceOpKernel(int nthreads, const int64 rows, template void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i, const Tensor& v, Tensor* y) { - const int64 nelem = v.NumElements(); + const int64_t nelem = v.NumElements(); GpuLaunchConfig cfg = GetGpuLaunchConfig(nelem, d); auto Ty = y->flat_outer_dims(); - const int64 nrows = Ty.dimension(0); - const int64 ncols = Ty.dimension(1); - const int64 n = i.NumElements(); + const int64_t nrows = Ty.dimension(0); + const int64_t ncols = Ty.dimension(1); + const int64_t n = i.NumElements(); const T* src = v.flat().data(); // TODO(sjhwang): Check that first dimension fits in int32 range. 
- const int32* rowids = i.flat().data(); + const int32_t* rowids = i.flat().data(); T* dst = y->flat().data(); switch (op) { case I_UPDATE: @@ -144,15 +144,15 @@ void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i, template void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i, const Tensor& v, Tensor* y) { - const int64 nelem = v.NumElements(); + const int64_t nelem = v.NumElements(); GpuLaunchConfig cfg = GetGpuLaunchConfig(nelem, d); auto Ty = y->flat_outer_dims(); - const int64 nrows = Ty.dimension(0); - const int64 ncols = Ty.dimension(1); - const int64 n = i.NumElements(); + const int64_t nrows = Ty.dimension(0); + const int64_t ncols = Ty.dimension(1); + const int64_t n = i.NumElements(); const bool* src = v.flat().data(); // TODO(sjhwang): Check that first dimension fits in int32 range. - const int32* rowids = i.flat().data(); + const int32_t* rowids = i.flat().data(); bool* dst = y->flat().data(); if (op == I_UPDATE) { TF_CHECK_OK(GpuLaunchKernel(DoInplaceOpKernel, @@ -163,8 +163,8 @@ void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i, } template <> -Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i, - const Tensor& v, Tensor* y) { +absl::Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i, + const Tensor& v, Tensor* y) { CHECK_EQ(v.dtype(), y->dtype()); switch (v.dtype()) { #define CASE(type) \ @@ -186,11 +186,11 @@ Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i, return errors::InvalidArgument("Unsupported data type from DoInplace: ", DataTypeString(v.dtype())); } - return OkStatus(); + return absl::OkStatus(); } template <> -Status DoCopy(const Device& d, const Tensor& x, Tensor* y) { +absl::Status DoCopy(const Device& d, const Tensor& x, Tensor* y) { CHECK_EQ(x.dtype(), y->dtype()); switch (x.dtype()) { #define CASE(type) \ @@ -214,7 +214,7 @@ Status DoCopy(const Device& d, const Tensor& x, Tensor* y) { return errors::InvalidArgument("Unsupported dtype from DoCopy: ", DataTypeString(x.dtype())); } - return OkStatus(); + return absl::OkStatus(); } } // end namespace functor diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc index 51c0d4b6654034..3919cb763171c7 100644 --- a/tensorflow/core/kernels/list_kernels.cc +++ b/tensorflow/core/kernels/list_kernels.cc @@ -48,7 +48,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; absl::Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) { if (t.shape() == TensorShape({})) { - if ((t.dtype() == DT_INT32 && t.scalar()() == -1) || + if ((t.dtype() == DT_INT32 && t.scalar()() == -1) || (t.dtype() == DT_INT64 && t.scalar()() == -1)) { *out = PartialTensorShape(); return absl::OkStatus(); @@ -61,7 +61,7 @@ absl::Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) { t.shape().dims()); } if (t.dtype() == DT_INT32) { - return PartialTensorShape::MakePartialShape(t.vec().data(), + return PartialTensorShape::MakePartialShape(t.vec().data(), t.NumElements(), out); } else if (t.dtype() == DT_INT64) { return PartialTensorShape::MakePartialShape(t.vec().data(), @@ -157,7 +157,7 @@ class EmptyTensorList : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result, attr)); TensorList empty; empty.element_dtype = element_dtype_; - empty.max_num_elements = max_num_elements_t.scalar()(); + empty.max_num_elements = max_num_elements_t.scalar()(); PartialTensorShape element_shape; OP_REQUIRES_OK(ctx, TensorShapeFromTensor(ctx->input(0), &element_shape)); 
empty.element_shape = element_shape; @@ -257,7 +257,7 @@ class TensorListLength : public OpKernel { OP_REQUIRES_OK(c, GetInputList(c, 0, &l)); Tensor* result; OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result)); - result->scalar()() = l->tensors().size(); + result->scalar()() = l->tensors().size(); } }; @@ -287,7 +287,7 @@ class TensorListElementShape : public OpKernel { if (l->element_shape.unknown_rank()) { OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &result)); if (result->dtype() == DT_INT32) { - result->scalar()() = -1; + result->scalar()() = -1; } else { result->scalar()() = -1; } @@ -296,7 +296,7 @@ class TensorListElementShape : public OpKernel { 0, TensorShape{l->element_shape.dims()}, &result)); for (int i = 0; i < l->element_shape.dims(); ++i) { if (result->dtype() == DT_INT32) { - result->flat()(i) = l->element_shape.dim_size(i); + result->flat()(i) = l->element_shape.dim_size(i); } else { result->flat()(i) = l->element_shape.dim_size(i); } @@ -336,7 +336,7 @@ class TensorListReserve : public OpKernel { errors::InvalidArgument( "The num_elements to reserve must be a tensor size 1, but got ", c->input(1).shape())); - int32_t num_elements = c->input(1).scalar()(); + int32_t num_elements = c->input(1).scalar()(); OP_REQUIRES(c, num_elements >= 0, errors::InvalidArgument("The num_elements to reserve must be a " "non negative number, but got ", @@ -384,7 +384,7 @@ class TensorListResize : public OpKernel { OP_REQUIRES_OK(c, GetInputList(c, 0, &input_list)); OP_REQUIRES(c, TensorShapeUtils::IsScalar(c->input(1).shape()), errors::InvalidArgument("size must be a scalar")); - int32_t size = c->input(1).scalar()(); + int32_t size = c->input(1).scalar()(); OP_REQUIRES( c, size >= 0, errors::InvalidArgument( @@ -473,7 +473,7 @@ class TensorListSetItem : public OpKernel { " list shape: ", l->element_shape.DebugString())); TensorList* output_list = nullptr; OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list)); - int32_t index = c->input(1).scalar()(); + int32_t index = c->input(1).scalar()(); if (!resize_if_index_out_of_bounds_) { OP_REQUIRES(c, index < l->tensors().size(), errors::InvalidArgument("Trying to modify element ", index, diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h index 9837b08716afae..5af26a518f0b18 100644 --- a/tensorflow/core/kernels/list_kernels.h +++ b/tensorflow/core/kernels/list_kernels.h @@ -80,8 +80,8 @@ template inline void SetZero(OpKernelContext* ctx, Tensor& tensor) { #ifdef PLUGGABLE_DEVICE_SUPPORTED if (IsPluggableDevice(ctx)) { - auto ptr = - se::DeviceMemoryBase(tensor.flat().data(), tensor.TotalBytes()); + auto ptr = stream_executor::DeviceAddressBase(tensor.flat().data(), + tensor.TotalBytes()); auto stream = ctx->op_device_context()->stream(); auto result = stream->MemZero(&ptr, tensor.TotalBytes()).ok(); DCHECK_EQ(true, result); @@ -101,8 +101,10 @@ inline void CopyTensorPluggableDevice(OpKernelContext* ctx, Tensor& src, auto src_t = src.unaligned_flat(); auto dst_t = dst.flat(); DCHECK(DataTypeCanUseMemcpy(DataTypeToEnum::v())); - auto src_ptr = se::DeviceMemoryBase(src_t.data(), src.TotalBytes()); - auto dst_ptr = se::DeviceMemoryBase(dst_t.data(), dst.TotalBytes()); + auto src_ptr = + stream_executor::DeviceAddressBase(src_t.data(), src.TotalBytes()); + auto dst_ptr = + stream_executor::DeviceAddressBase(dst_t.data(), dst.TotalBytes()); auto stream = ctx->op_device_context()->stream(); auto result = stream->Memcpy(&dst_ptr, src_ptr, src.TotalBytes()).ok(); 
DCHECK_EQ(true, result); @@ -133,7 +135,7 @@ void ConcatPluggableDevice( size_t num_inputs = inputs.size(); std::vector sizes; sizes.reserve(num_inputs); - int64 row_size = 0; + int64_t row_size = 0; for (const auto& input : inputs) { sizes.push_back(input->dimension(1)); row_size += sizes.back(); @@ -145,12 +147,13 @@ void ConcatPluggableDevice( for (const auto& input : inputs) { inp.push_back(&(*input)(0, 0)); } - const int64 dim0 = output->dimension(0); - for (int64 i = 0; i < dim0; ++i) { - for (int64 j = 0; j < num_inputs; ++j) { + const int64_t dim0 = output->dimension(0); + for (int64_t i = 0; i < dim0; ++i) { + for (int64_t j = 0; j < num_inputs; ++j) { auto size = sizes[j]; - se::DeviceMemoryBase out_base{out, size * sizeof(T)}; - se::DeviceMemoryBase inp_base{const_cast(inp[j]), size * sizeof(T)}; + stream_executor::DeviceAddressBase out_base{out, size * sizeof(T)}; + stream_executor::DeviceAddressBase inp_base{const_cast(inp[j]), + size * sizeof(T)}; OP_REQUIRES_OK(context, stream->Memcpy(&out_base, inp_base, size * sizeof(T))); out += size; @@ -284,7 +287,7 @@ class TensorListGetItem : public OpKernel { DataTypeString(element_dtype_), " but list elements ", DataTypeString(l->element_dtype))); - int32_t index = c->input(1).scalar()(); + int32_t index = c->input(1).scalar()(); OP_REQUIRES(c, index < l->tensors().size(), errors::InvalidArgument("Trying to access element ", index, " in a list with ", l->tensors().size(), @@ -693,7 +696,7 @@ class TensorListGather : public OpKernel { // element tensors. if (!tensor_list->element_shape.IsFullyDefined()) { for (int index = 0; index < indices.NumElements(); ++index) { - const int i = indices.flat()(index); + const int i = indices.flat()(index); OP_REQUIRES(c, 0 <= i && i < tensor_list->tensors().size(), absl::InvalidArgumentError(absl::StrCat( @@ -728,7 +731,7 @@ class TensorListGather : public OpKernel { inputs_flat.reserve(indices.NumElements()); Tensor zeros; for (int index = 0; index < indices.NumElements(); ++index) { - const int i = indices.flat()(index); + const int i = indices.flat()(index); OP_REQUIRES( c, i < tensor_list->tensors().size(), errors::InvalidArgument("Index ", i, " out o range; list only has ", @@ -832,7 +835,7 @@ absl::Status Scatter(OpKernelContext* c, const Tensor& value, const auto copy_tensor = IsPluggableDevice(c) ? &CopyTensorPluggableDevice : &CopyTensor; for (int index = 0; index < indices.NumElements(); ++index) { - const int i = indices.flat()(index); + const int i = indices.flat()(index); Tensor tmp = value.Slice(index, index + 1); TensorShape tmp_shape = tmp.shape(); tmp_shape.RemoveDim(0); @@ -885,7 +888,7 @@ class TensorListScatterIntoExistingList : public OpKernel { // Resize the list if needed to accommodate all indices. TensorList* output_list = nullptr; OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list)); - const auto indices_vec = indices.vec(); + const auto indices_vec = indices.vec(); int32_t max_index = (indices.NumElements() == 0) ? 
-1 @@ -956,7 +959,7 @@ class TensorListScatter : public OpKernel { { int highest_index = -1; for (int index = 0; index < indices.NumElements(); ++index) { - const int i = indices.flat()(index); + const int i = indices.flat()(index); OP_REQUIRES( c, i >= 0, errors::InvalidArgument( diff --git a/tensorflow/core/kernels/listdiff_op.cc b/tensorflow/core/kernels/listdiff_op.cc index 92d461aba58c8e..eb0a6eec9345aa 100644 --- a/tensorflow/core/kernels/listdiff_op.cc +++ b/tensorflow/core/kernels/listdiff_op.cc @@ -48,7 +48,7 @@ class ListDiffOp : public OpKernel { const auto Ty = y.vec(); const size_t y_size = Ty.size(); - OP_REQUIRES(context, x_size < std::numeric_limits::max(), + OP_REQUIRES(context, x_size < std::numeric_limits::max(), errors::InvalidArgument("x too large for int32 indexing")); std::unordered_set y_set; diff --git a/tensorflow/core/kernels/load_and_remap_matrix_op.cc b/tensorflow/core/kernels/load_and_remap_matrix_op.cc index c746fec71d5e4d..a952da3595ccda 100644 --- a/tensorflow/core/kernels/load_and_remap_matrix_op.cc +++ b/tensorflow/core/kernels/load_and_remap_matrix_op.cc @@ -133,11 +133,11 @@ class LoadAndRemapMatrixOp : public OpKernel { errors::InvalidArgument("The `ckpt_path` tensor must have exactly one " "element, got tensor of shape ", ckpt_path_t->shape().DebugString())); - const string& ckpt_path = ckpt_path_t->scalar()(); + const std::string& ckpt_path = ckpt_path_t->scalar()(); const Tensor* old_tensor_name_t; OP_REQUIRES_OK(context, context->input("old_tensor_name", &old_tensor_name_t)); - const string& old_tensor_name = old_tensor_name_t->scalar()(); + const std::string& old_tensor_name = old_tensor_name_t->scalar()(); LOG(INFO) << "Processing checkpoint : " << ckpt_path; BundleReader reader(context->env(), ckpt_path); diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc index 904e84d21778aa..b589d918626f1d 100644 --- a/tensorflow/core/kernels/logging_ops.cc +++ b/tensorflow/core/kernels/logging_ops.cc @@ -64,7 +64,7 @@ void AssertOp::Compute(OpKernelContext* ctx) { if (cond.scalar()()) { return; } - string msg = "assertion failed: "; + std::string msg = "assertion failed: "; for (int i = 1; i < ctx->num_inputs(); ++i) { absl::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_), "]"); if (i < ctx->num_inputs() - 1) absl::StrAppend(&msg, " "); @@ -98,7 +98,7 @@ class PrintOp : public OpKernel { if (call_counter_ >= first_n_) return; call_counter_++; } - string msg; + std::string msg; absl::StrAppend(&msg, message_); for (int i = 1; i < ctx->num_inputs(); ++i) { absl::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_), "]"); @@ -110,8 +110,8 @@ class PrintOp : public OpKernel { mutex mu_; int64_t call_counter_ TF_GUARDED_BY(mu_) = 0; int64_t first_n_ = 0; - int32 summarize_ = 0; - string message_; + int32_t summarize_ = 0; + std::string message_; }; REGISTER_KERNEL_BUILDER(Name("Print").Device(DEVICE_CPU), PrintOp); @@ -130,8 +130,8 @@ class PrintV2Op : public OpKernel { std::end(valid_output_streams_), output_stream_); if (output_stream_index == std::end(valid_output_streams_)) { - string error_msg = absl::StrCat("Unknown output stream: ", output_stream_, - ", Valid streams are:"); + std::string error_msg = absl::StrCat( + "Unknown output stream: ", output_stream_, ", Valid streams are:"); for (auto valid_stream : valid_output_streams_) { absl::StrAppend(&error_msg, " ", valid_stream); } @@ -146,9 +146,9 @@ class PrintV2Op : public OpKernel { ctx, TensorShapeUtils::IsScalar(input_->shape()), 
errors::InvalidArgument("Input is expected to be scalar, but got ", input_->shape())); - const string& msg = input_->scalar()(); + const std::string& msg = input_->scalar()(); - string ended_msg = absl::StrCat(msg, end_); + std::string ended_msg = absl::StrCat(msg, end_); if (!file_path_.empty()) { // Outputs to a file at the specified path. @@ -172,8 +172,8 @@ class PrintV2Op : public OpKernel { } else if (output_stream_ == "log(error)") { LOG(ERROR) << ended_msg << std::flush; } else { - string error_msg = absl::StrCat("Unknown output stream: ", output_stream_, - ", Valid streams are:"); + std::string error_msg = absl::StrCat( + "Unknown output stream: ", output_stream_, ", Valid streams are:"); for (auto valid_stream : valid_output_streams_) { absl::StrAppend(&error_msg, " ", valid_stream); } @@ -186,10 +186,10 @@ class PrintV2Op : public OpKernel { "log(warning)", "log(error)"}; private: - string end_; + std::string end_; // Either output_stream_ or file_path_ (but not both) will be non-empty. - string output_stream_; - string file_path_; + std::string output_stream_; + std::string file_path_; // If output_stream_ is a file path, extracts it to file_path_ and clears // output_stream_; otherwise sets file_paths_ to "". diff --git a/tensorflow/core/kernels/logging_ops.h b/tensorflow/core/kernels/logging_ops.h index 5cb1213998f499..f5a58643d8e1a3 100644 --- a/tensorflow/core/kernels/logging_ops.h +++ b/tensorflow/core/kernels/logging_ops.h @@ -25,7 +25,7 @@ class AssertOp : public OpKernel { void Compute(OpKernelContext* ctx) override; private: - int32 summarize_ = 0; + int32_t summarize_ = 0; }; } // namespace tensorflow diff --git a/tensorflow/core/kernels/logging_ops_test.cc b/tensorflow/core/kernels/logging_ops_test.cc index 7efdeac7d1db9f..fbce44642938db 100644 --- a/tensorflow/core/kernels/logging_ops_test.cc +++ b/tensorflow/core/kernels/logging_ops_test.cc @@ -34,7 +34,7 @@ namespace { class PrintingV2GraphTest : public OpsTestBase { protected: - absl::Status Init(const string& output_stream = "log(warning)") { + absl::Status Init(const std::string& output_stream = "log(warning)") { TF_CHECK_OK(NodeDefBuilder("op", "PrintV2") .Input(FakeInput(DT_STRING)) .Attr("output_stream", output_stream) @@ -61,8 +61,8 @@ TEST_F(PrintingV2GraphTest, InvalidInputRank) { class PrintingGraphTest : public OpsTestBase { protected: - absl::Status Init(DataType input_type1, DataType input_type2, string msg = "", - int first_n = -1, int summarize = 3) { + absl::Status Init(DataType input_type1, DataType input_type2, + std::string msg = "", int first_n = -1, int summarize = 3) { TF_CHECK_OK(NodeDefBuilder("op", "Print") .Input(FakeInput(input_type1)) .Input(FakeInput(2, input_type2)) @@ -76,58 +76,58 @@ class PrintingGraphTest : public OpsTestBase { TEST_F(PrintingGraphTest, Int32Success_6) { TF_ASSERT_OK(Init(DT_INT32, DT_INT32)); - AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); - AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); - AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); TF_ASSERT_OK(RunOpKernel()); Tensor expected(allocator(), DT_INT32, TensorShape({6})); - test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); - test::ExpectTensorEqual(expected, *GetOutput(0)); + test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual(expected, *GetOutput(0)); } TEST_F(PrintingGraphTest, 
Int32Success_Summarize6) { TF_ASSERT_OK(Init(DT_INT32, DT_INT32, "", -1, 6)); - AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); - AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); - AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); TF_ASSERT_OK(RunOpKernel()); Tensor expected(allocator(), DT_INT32, TensorShape({6})); - test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); - test::ExpectTensorEqual(expected, *GetOutput(0)); + test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual(expected, *GetOutput(0)); } TEST_F(PrintingGraphTest, StringSuccess) { TF_ASSERT_OK(Init(DT_INT32, DT_STRING)); - AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); AddInputFromArray(TensorShape({}), {"foo"}); AddInputFromArray(TensorShape({}), {"bar"}); TF_ASSERT_OK(RunOpKernel()); Tensor expected(allocator(), DT_INT32, TensorShape({6})); - test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); - test::ExpectTensorEqual(expected, *GetOutput(0)); + test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual(expected, *GetOutput(0)); } TEST_F(PrintingGraphTest, MsgSuccess) { TF_ASSERT_OK(Init(DT_INT32, DT_STRING, "Message: ")); - AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); AddInputFromArray(TensorShape({}), {"foo"}); AddInputFromArray(TensorShape({}), {"bar"}); TF_ASSERT_OK(RunOpKernel()); Tensor expected(allocator(), DT_INT32, TensorShape({6})); - test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); - test::ExpectTensorEqual(expected, *GetOutput(0)); + test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual(expected, *GetOutput(0)); } TEST_F(PrintingGraphTest, FirstNSuccess) { TF_ASSERT_OK(Init(DT_INT32, DT_STRING, "", 3)); - AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); AddInputFromArray(TensorShape({}), {"foo"}); AddInputFromArray(TensorShape({}), {"bar"}); // run 4 times but we only print 3 as intended for (int i = 0; i < 4; i++) TF_ASSERT_OK(RunOpKernel()); Tensor expected(allocator(), DT_INT32, TensorShape({6})); - test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); - test::ExpectTensorEqual(expected, *GetOutput(0)); + test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual(expected, *GetOutput(0)); } class TimestampTest : public OpsTestBase { diff --git a/tensorflow/core/kernels/lookup_ops_test.cc b/tensorflow/core/kernels/lookup_ops_test.cc index 2a57a46cf165f0..fb13ccc162eb90 100644 --- a/tensorflow/core/kernels/lookup_ops_test.cc +++ b/tensorflow/core/kernels/lookup_ops_test.cc @@ -51,8 +51,8 @@ class MockHashTable : public lookup::HashTable { ~MockHashTable() override { alive = false; } }; -typedef int32 key_dtype; -typedef int32 value_dtype; +typedef int32_t key_dtype; +typedef int32_t value_dtype; REGISTER_KERNEL_BUILDER( Name("MockAnonymousHashTable") diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc index 27cc76ee11b945..c936cad9addd6d 100644 --- a/tensorflow/core/kernels/lookup_table_init_op.cc +++ b/tensorflow/core/kernels/lookup_table_init_op.cc @@ -111,7 +111,7 @@ class InitializeTableFromTextFileOp : public OpKernel { if (ctx->HasAttr("offset")) { OP_REQUIRES_OK(ctx, 
ctx->GetAttr("offset", &offset_)); } - string delimiter; + std::string delimiter; OP_REQUIRES_OK(ctx, ctx->GetAttr("delimiter", &delimiter)); OP_REQUIRES(ctx, delimiter.size() == 1, errors::InvalidArgument("delimiter should be only 1 char")); @@ -137,7 +137,8 @@ class InitializeTableFromTextFileOp : public OpKernel { errors::InvalidArgument("filename should be a single string, but got ", vocab_filename_tensor.shape().DebugString())); - const string& vocab_filename = vocab_filename_tensor.scalar()(); + const std::string& vocab_filename = + vocab_filename_tensor.scalar()(); OP_REQUIRES(ctx, !vocab_filename.empty(), errors::InvalidArgument("filename cannot be empty.")); diff --git a/tensorflow/core/kernels/lookup_table_init_op.h b/tensorflow/core/kernels/lookup_table_init_op.h index e94db921bfd237..f6e246486a4532 100644 --- a/tensorflow/core/kernels/lookup_table_init_op.h +++ b/tensorflow/core/kernels/lookup_table_init_op.h @@ -22,7 +22,7 @@ namespace tensorflow { namespace lookup { // Helper function to initialize an InitializableLookupTable from a text file. -absl::Status InitializeTableFromTextFile(const string& filename, +absl::Status InitializeTableFromTextFile(const std::string& filename, int64_t vocab_size, char delimiter, int32_t key_index, int32_t value_index, Env* env, diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc index 49a28dc324b9fb..54d2c8cca1669e 100644 --- a/tensorflow/core/kernels/lookup_table_op.cc +++ b/tensorflow/core/kernels/lookup_table_op.cc @@ -411,11 +411,11 @@ class MutableHashTableOfTensors final : public LookupInterface { namespace { template -inline uint64 HashScalar(const T& key) { - return static_cast(key); +inline uint64_t HashScalar(const T& key) { + return static_cast(key); } -inline uint64 HashScalar(const tstring& key) { return Hash64(key); } +inline uint64_t HashScalar(const tstring& key) { return Hash64(key); } // If the given shape is a scalar return {1} instead. Otherwise leave it alone. 
TensorShape MaybeVectorizeShape(const TensorShape& shape) { @@ -523,7 +523,7 @@ class MutableDenseHashTable final : public LookupInterface { const int64_t bit_mask = num_buckets_ - 1; // TODO(andreasst): parallelize using work_sharder for (int64_t i = 0; i < num_elements; ++i) { - const uint64 key_hash = HashKey(key_matrix, i); + const uint64_t key_hash = HashKey(key_matrix, i); if (empty_key_hash_ == key_hash && IsEqualKey(empty_key_matrix, 0, key_matrix, i)) { return errors::InvalidArgument( @@ -693,7 +693,7 @@ class MutableDenseHashTable final : public LookupInterface { deleted_key_.template shaped({1, key_size}); const int64_t bit_mask = num_buckets_ - 1; for (int64_t i = 0; i < num_elements; ++i) { - const uint64 key_hash = HashKey(key_matrix, i); + const uint64_t key_hash = HashKey(key_matrix, i); if (empty_key_hash_ == key_hash && IsEqualKey(empty_key_tensor, 0, key_matrix, i)) { if (ignore_empty_and_deleted_key) { @@ -760,7 +760,7 @@ class MutableDenseHashTable final : public LookupInterface { const auto deleted_key_flat = deleted_key_.template flat(); const int64_t bit_mask = num_buckets_ - 1; for (int64_t i = 0; i < num_elements; ++i) { - const uint64 key_hash = HashKey(key_matrix, i); + const uint64_t key_hash = HashKey(key_matrix, i); if (empty_key_hash_ == key_hash && IsEqualKey(empty_key_tensor, 0, key_matrix, i)) { return errors::InvalidArgument( @@ -843,11 +843,11 @@ class MutableDenseHashTable final : public LookupInterface { return DoInsert(ctx, old_key_buckets, old_value_buckets, true); } - uint64 HashKey(typename TTypes::ConstMatrix key, int64_t index) const { + uint64_t HashKey(typename TTypes::ConstMatrix key, int64_t index) const { if (key_shape_.num_elements() == 1) { return HashScalar(key(index, 0)); } - uint64 result = 0; + uint64_t result = 0; for (int64_t i = 0; i < key_shape_.num_elements(); ++i) { result = Hash64Combine(result, HashScalar(key(index, i))); } @@ -876,9 +876,9 @@ class MutableDenseHashTable final : public LookupInterface { Tensor key_buckets_ TF_GUARDED_BY(mu_); Tensor value_buckets_ TF_GUARDED_BY(mu_); Tensor empty_key_; - uint64 empty_key_hash_; + uint64_t empty_key_hash_; Tensor deleted_key_; - uint64 deleted_key_hash_; + uint64_t deleted_key_hash_; }; } // namespace lookup @@ -1103,19 +1103,19 @@ REGISTER_KERNEL_BUILDER(Name("LookupTableImportV2").Device(DEVICE_CPU), AnonymousLookupTableOp, \ key_dtype, value_dtype>) -REGISTER_KERNEL(int32, double); -REGISTER_KERNEL(int32, float); -REGISTER_KERNEL(int32, int32); -REGISTER_KERNEL(int32, tstring); +REGISTER_KERNEL(int32_t, double); +REGISTER_KERNEL(int32_t, float); +REGISTER_KERNEL(int32_t, int32_t); +REGISTER_KERNEL(int32_t, tstring); REGISTER_KERNEL(int64_t, double); REGISTER_KERNEL(int64_t, float); -REGISTER_KERNEL(int64_t, int32); +REGISTER_KERNEL(int64_t, int32_t); REGISTER_KERNEL(int64_t, int64_t); REGISTER_KERNEL(int64_t, tstring); REGISTER_KERNEL(tstring, bool); REGISTER_KERNEL(tstring, double); REGISTER_KERNEL(tstring, float); -REGISTER_KERNEL(tstring, int32); +REGISTER_KERNEL(tstring, int32_t); REGISTER_KERNEL(tstring, int64_t); REGISTER_KERNEL(tstring, tstring); @@ -1146,19 +1146,19 @@ REGISTER_KERNEL(tstring, tstring); lookup::MutableHashTableOfScalars, \ key_dtype, value_dtype>) -REGISTER_KERNEL(int32, double); -REGISTER_KERNEL(int32, float); -REGISTER_KERNEL(int32, int32); +REGISTER_KERNEL(int32_t, double); +REGISTER_KERNEL(int32_t, float); +REGISTER_KERNEL(int32_t, int32_t); REGISTER_KERNEL(int64_t, double); REGISTER_KERNEL(int64_t, float); -REGISTER_KERNEL(int64_t, int32); 
+REGISTER_KERNEL(int64_t, int32_t); REGISTER_KERNEL(int64_t, int64_t); REGISTER_KERNEL(int64_t, tstring); REGISTER_KERNEL(int64_t, Variant); REGISTER_KERNEL(tstring, bool); REGISTER_KERNEL(tstring, double); REGISTER_KERNEL(tstring, float); -REGISTER_KERNEL(tstring, int32); +REGISTER_KERNEL(tstring, int32_t); REGISTER_KERNEL(tstring, int64_t); #undef REGISTER_KERNEL @@ -1188,18 +1188,18 @@ REGISTER_KERNEL(tstring, int64_t); lookup::MutableHashTableOfTensors, \ key_dtype, value_dtype>) -REGISTER_KERNEL(int32, double); -REGISTER_KERNEL(int32, float); -REGISTER_KERNEL(int32, int32); +REGISTER_KERNEL(int32_t, double); +REGISTER_KERNEL(int32_t, float); +REGISTER_KERNEL(int32_t, int32_t); REGISTER_KERNEL(int64_t, double); REGISTER_KERNEL(int64_t, float); -REGISTER_KERNEL(int64_t, int32); +REGISTER_KERNEL(int64_t, int32_t); REGISTER_KERNEL(int64_t, int64_t); REGISTER_KERNEL(int64_t, tstring); REGISTER_KERNEL(tstring, bool); REGISTER_KERNEL(tstring, double); REGISTER_KERNEL(tstring, float); -REGISTER_KERNEL(tstring, int32); +REGISTER_KERNEL(tstring, int32_t); REGISTER_KERNEL(tstring, int64_t); #undef REGISTER_KERNEL @@ -1229,19 +1229,19 @@ REGISTER_KERNEL(tstring, int64_t); lookup::MutableDenseHashTable, key_dtype, \ value_dtype>) -REGISTER_KERNEL(int32, double); -REGISTER_KERNEL(int32, float); -REGISTER_KERNEL(int32, int32); +REGISTER_KERNEL(int32_t, double); +REGISTER_KERNEL(int32_t, float); +REGISTER_KERNEL(int32_t, int32_t); REGISTER_KERNEL(int64_t, bool); REGISTER_KERNEL(int64_t, double); REGISTER_KERNEL(int64_t, float); -REGISTER_KERNEL(int64_t, int32); +REGISTER_KERNEL(int64_t, int32_t); REGISTER_KERNEL(int64_t, int64_t); REGISTER_KERNEL(int64_t, Variant); REGISTER_KERNEL(tstring, bool); REGISTER_KERNEL(tstring, double); REGISTER_KERNEL(tstring, float); -REGISTER_KERNEL(tstring, int32); +REGISTER_KERNEL(tstring, int32_t); REGISTER_KERNEL(tstring, int64_t); REGISTER_KERNEL(tstring, ResourceHandle); diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h index daa7f6e32dc9dd..840720d2e3e61d 100644 --- a/tensorflow/core/kernels/lookup_table_op.h +++ b/tensorflow/core/kernels/lookup_table_op.h @@ -300,7 +300,7 @@ class HashTable : public InitializableLookupTable { return absl::OkStatus(); }; - absl::Status DoLazyPrepare(std::function size_fn) override { + absl::Status DoLazyPrepare(std::function size_fn) override { return DoPrepare(size_fn()); } diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc index 3576b6c7339bd1..744b2e9c21b5ac 100644 --- a/tensorflow/core/kernels/lookup_util.cc +++ b/tensorflow/core/kernels/lookup_util.cc @@ -37,13 +37,13 @@ static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */ static const int kLineNumber = -1; static const int kWholeLine = -2; -absl::Status GetNumLinesInTextFile(Env* env, const string& vocab_file, +absl::Status GetNumLinesInTextFile(Env* env, const std::string& vocab_file, int64_t* num_lines) { std::unique_ptr file; TF_RETURN_IF_ERROR(env->NewRandomAccessFile(vocab_file, &file)); io::InputBuffer input_buffer(file.get(), kInputBufferSize); - string line; + std::string line; absl::Status s = input_buffer.ReadLine(&line); int64_t next_id = 0; while (s.ok()) { @@ -81,9 +81,10 @@ class TextFileLineIterator // - Index -1 means the line number stored in int64. // - Index >= 0 represent index (starting at zero) of the split line based on // delimiter. 
- absl::Status Init(const string& filename, int64_t vocab_size, char delimiter, - DataType key_dtype, int64_t key_index, DataType value_dtype, - int64_t value_index, int64_t offset, Env* env) { + absl::Status Init(const std::string& filename, int64_t vocab_size, + char delimiter, DataType key_dtype, int64_t key_index, + DataType value_dtype, int64_t value_index, int64_t offset, + Env* env) { filename_ = filename; vocab_size_ = vocab_size; delimiter_ = delimiter; @@ -108,7 +109,7 @@ class TextFileLineIterator void Next() override { if (!valid_) return; - string line; + std::string line; status_ = input_buffer_->ReadLine(&line); if (!status_.ok()) { if (absl::IsOutOfRange(status_) && vocab_size_ != -1 && @@ -137,7 +138,7 @@ class TextFileLineIterator return; } - std::vector tokens; + std::vector tokens; if (!ignore_split_) { tokens = str_util::Split(line, delimiter_); const auto expected_size = @@ -197,7 +198,7 @@ class TextFileLineIterator int64_t next_id_; int64_t offset_; int64_t vocab_size_; - string filename_; + std::string filename_; char delimiter_; absl::Status status_; bool ignore_split_; @@ -206,13 +207,14 @@ class TextFileLineIterator // Set the corresponding value from line or tokens based on 'index' into the // tensor 't'. The value is transformed to the given data type 'dtype'. - absl::Status SetValue(const string& line, const std::vector& tokens, - int64_t index, Tensor* tensor) { + absl::Status SetValue(const std::string& line, + const std::vector& tokens, int64_t index, + Tensor* tensor) { if (index == kLineNumber) { tensor->flat()(0) = next_id_ + offset_; return absl::OkStatus(); } - const string& token = (index == kWholeLine) ? line : tokens[index]; + const std::string& token = (index == kWholeLine) ? line : tokens[index]; const DataType& dtype = tensor->dtype(); switch (dtype) { case DT_INT32: { @@ -222,7 +224,7 @@ class TextFileLineIterator return errors::InvalidArgument("Field ", token, " in line ", next_id_, " is not a valid int32."); } - tensor->flat()(0) = value + offset_; + tensor->flat()(0) = value + offset_; } break; case DT_INT64: { int64_t value; @@ -267,7 +269,7 @@ class TextFileLineIterator }; absl::Status GetTableHandle(absl::string_view input_name, OpKernelContext* ctx, - string* container, string* table_handle) { + std::string* container, std::string* table_handle) { { mutex* mu; TF_RETURN_IF_ERROR(ctx->input_ref_mutex(input_name, &mu)); @@ -300,8 +302,8 @@ absl::Status GetResourceLookupTable(absl::string_view input_name, absl::Status GetReferenceLookupTable(absl::string_view input_name, OpKernelContext* ctx, LookupInterface** table) { - string container; - string table_handle; + std::string container; + std::string table_handle; TF_RETURN_IF_ERROR( GetTableHandle(input_name, ctx, &container, &table_handle)); return ctx->resource_manager()->Lookup(container, table_handle, table); @@ -335,8 +337,8 @@ absl::Status GetInitializableLookupTable(absl::string_view input_name, handle.name(), " is not initializable"); } } else { - string container; - string table_handle; + std::string container; + std::string table_handle; TF_RETURN_IF_ERROR( GetTableHandle(input_name, ctx, &container, &table_handle)); TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup(container, table_handle, @@ -353,7 +355,7 @@ absl::Status GetInitializableLookupTable(absl::string_view input_name, absl::Status CheckTableDataTypes(const LookupInterface& table, DataType key_dtype, DataType value_dtype, - const string& table_name) { + const std::string& table_name) { if (table.key_dtype() != 
key_dtype || table.value_dtype() != value_dtype) { return errors::InvalidArgument( "Conflicting key/value dtypes ", DataTypeString(key_dtype), "->", @@ -365,7 +367,7 @@ absl::Status CheckTableDataTypes(const LookupInterface& table, } // Helper function to initialize an InitializableLookupTable from a text file. -absl::Status InitializeTableFromTextFile(const string& filename, +absl::Status InitializeTableFromTextFile(const std::string& filename, int64_t vocab_size, char delimiter, int32_t key_index, int32_t value_index, int64_t offset, Env* env, @@ -376,7 +378,7 @@ absl::Status InitializeTableFromTextFile(const string& filename, } absl::Status InitializeTableFromTextFile( - const string& filename, int64_t vocab_size, char delimiter, + const std::string& filename, int64_t vocab_size, char delimiter, int32_t key_index, int32_t value_index, int64_t offset, Env* env, std::unique_ptr serializer, InitializableLookupTable* table) { diff --git a/tensorflow/core/kernels/lookup_util.h b/tensorflow/core/kernels/lookup_util.h index 677c6a5659fc23..e48718ad805bdb 100644 --- a/tensorflow/core/kernels/lookup_util.h +++ b/tensorflow/core/kernels/lookup_util.h @@ -53,10 +53,10 @@ absl::Status GetInitializableLookupTable(absl::string_view input_name, // table's data types. absl::Status CheckTableDataTypes(const LookupInterface& table, DataType key_dtype, DataType value_dtype, - const string& table_name); + const std::string& table_name); // Initializes `table` from `filename`. -absl::Status InitializeTableFromTextFile(const string& filename, +absl::Status InitializeTableFromTextFile(const std::string& filename, int64_t vocab_size, char delimiter, int32_t key_index, int32_t value_index, int64_t offset, Env* env, @@ -65,7 +65,7 @@ absl::Status InitializeTableFromTextFile(const string& filename, // Initializes `table` from `filename`. `func` may specify how to represent the // initializer as a graphdef, so that the table can be serialized as metadata. 
absl::Status InitializeTableFromTextFile( - const string& filename, int64_t vocab_size, char delimiter, + const std::string& filename, int64_t vocab_size, char delimiter, int32_t key_index, int32_t value_index, int64_t offset, Env* env, std::unique_ptr serializer, InitializableLookupTable* table); diff --git a/tensorflow/core/kernels/lrn_op_test.cc b/tensorflow/core/kernels/lrn_op_test.cc index a4843b04d84b1b..3c8515d522501b 100644 --- a/tensorflow/core/kernels/lrn_op_test.cc +++ b/tensorflow/core/kernels/lrn_op_test.cc @@ -40,13 +40,13 @@ class LRNFloatTest : public OpsTestBase { protected: LRNFloatTest() : philox_(123, 17), rand_(&philox_) {} - int GetIntAttr(const string& name) { + int GetIntAttr(const std::string& name) { int value; TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value)); return value; } - float GetFloatAttr(const string& name) { + float GetFloatAttr(const std::string& name) { float value; TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value)); return value; diff --git a/tensorflow/core/kernels/map_kernels.h b/tensorflow/core/kernels/map_kernels.h index 6949ff554a286b..ab57ba02dccbc4 100644 --- a/tensorflow/core/kernels/map_kernels.h +++ b/tensorflow/core/kernels/map_kernels.h @@ -102,7 +102,7 @@ class TensorMapSize : public OpKernel { OP_REQUIRES_OK(ctx, GetInputMap(ctx, 0, &map)); Tensor* result; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result)); - result->scalar()() = map->tensors().size(); + result->scalar()() = map->tensors().size(); } }; diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc index 14787c38e72502..12e018dfdd311d 100644 --- a/tensorflow/core/kernels/map_stage_op.cc +++ b/tensorflow/core/kernels/map_stage_op.cc @@ -489,7 +489,7 @@ class StagingMap : public ResourceBase { return map_.size(); } - string DebugString() const override { return "StagingMap"; } + std::string DebugString() const override { return "StagingMap"; } }; template @@ -736,7 +736,7 @@ class MapSizeOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size)); // Set it to the actual size - size->scalar().setConstant(map->size()); + size->scalar().setConstant(map->size()); } }; @@ -766,7 +766,7 @@ class MapIncompleteSizeOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size)); // Set it to the actual size - size->scalar().setConstant(map->incomplete_size()); + size->scalar().setConstant(map->incomplete_size()); } }; diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc index 515e58d518a129..c48e6aeeab3bad 100644 --- a/tensorflow/core/kernels/matching_files_op.cc +++ b/tensorflow/core/kernels/matching_files_op.cc @@ -43,7 +43,7 @@ class MatchingFilesOp : public OpKernel { const auto patterns = patterns_t->flat(); int num_patterns = patterns.size(); int num_files = 0; - std::vector> all_fnames(num_patterns); + std::vector> all_fnames(num_patterns); for (int i = 0; i < num_patterns; i++) { OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(patterns(i), &all_fnames[i])); diff --git a/tensorflow/core/kernels/matmul_op_fused.cc b/tensorflow/core/kernels/matmul_op_fused.cc index 4e6a8d5266608d..343eba3db82f97 100644 --- a/tensorflow/core/kernels/matmul_op_fused.cc +++ b/tensorflow/core/kernels/matmul_op_fused.cc @@ -199,7 +199,7 @@ struct LaunchFusedMatMulOp { namespace { #if GOOGLE_CUDA || TF_HIPBLASLT -StatusOr GetBlasLtEpilogOp( +absl::StatusOr GetBlasLtEpilogOp( FusedComputationType fusion) { if (fusion == 
FusedComputationType::kBiasAdd) { return se::gpu::BlasLt::Epilogue::kBias; @@ -235,7 +235,7 @@ se::blas::AlgorithmConfig AutotuneMatmul( // scratch space is deallocated between runs. BlasScratchAllocator scratch_allocator(context); - Status cublaslt_launch = + absl::Status cublaslt_launch = launch_func(scratch_allocator, i, &profile_result); VLOG(4) << " Autotune algorithm " << i @@ -265,7 +265,7 @@ se::blas::AlgorithmConfig AutotuneMatmul( #endif template -StatusOr> AutotuneMatMulImpl( +absl::StatusOr> AutotuneMatMulImpl( OpKernelContext* ctx, std::vector>>& runners, bool actually_do_autotune, const LaunchFunc& launch_func, @@ -292,10 +292,10 @@ StatusOr> AutotuneMatMulImpl( TF_ASSIGN_OR_RETURN(auto desc, runner->ToAlgorithmDesc()); se::dnn::ProfileResult profile_result; - Status cudnn_launch_status = + absl::Status cudnn_launch_status = actually_do_autotune ? launch_func(allocator_used, runner, &profile_result) - : OkStatus(); + : absl::OkStatus(); if (!actually_do_autotune) { // Make the result valid according to `is_valid`. profile_result.set_algorithm(desc); @@ -329,7 +329,7 @@ StatusOr> AutotuneMatMulImpl( } struct FusedMatmulAutotuneGroup { - static string name() { return "FusedMatmul"; } + static std::string name() { return "FusedMatmul"; } }; typedef AutotuneSingleton -StatusOr> AutotuneFusedMatmul( +absl::StatusOr> +AutotuneFusedMatmul( bool cudnn_use_autotune, AutotuneMap>* autotune_map, @@ -350,7 +351,7 @@ StatusOr> AutotuneFusedMatmul( AutotuneEntry autotune_entry; auto* stream = ctx->op_device_context()->stream(); if (!autotune_map->Find(params, &autotune_entry)) { - profiler::ScopedAnnotation trace("cudnn_autotuning"); + tsl::profiler::ScopedAnnotation trace("cudnn_autotuning"); se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}), stream); @@ -371,7 +372,7 @@ StatusOr> AutotuneFusedMatmul( auto launch_func = [&](se::ScratchAllocator* allocator_used, const std::unique_ptr& runner, - se::dnn::ProfileResult* profile_result) -> Status { + se::dnn::ProfileResult* profile_result) -> absl::Status { TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes( runner->GetWorkspaceSize())); return (*runner)(stream, profile_result, scratch, a_ptr, b_ptr, bias_ptr, @@ -562,8 +563,9 @@ struct LaunchFusedMatMulOp { auto runner_and_scratch = std::move(runner_and_scratch_or).value(); auto& runner = *std::get(runner_and_scratch); - Status cudnn_launch_status = runner( - stream, nullptr, std::get(runner_and_scratch), + absl::Status cudnn_launch_status = runner( + stream, nullptr, + std::get(runner_and_scratch), a_ptr, b_ptr, bias_ptr, c_ptr); OP_REQUIRES_OK(context, cudnn_launch_status); return; diff --git a/tensorflow/core/kernels/matmul_op_impl.h b/tensorflow/core/kernels/matmul_op_impl.h index f4991bc1fe252a..628e6d8dabceb2 100644 --- a/tensorflow/core/kernels/matmul_op_impl.h +++ b/tensorflow/core/kernels/matmul_op_impl.h @@ -477,7 +477,7 @@ struct LaunchBatchMatMul { namespace { // A dummy type to group matmul autotune results together. 
struct BlasLtMatmulAutoTuneGroup { - static string name() { return "MatmulLt"; } + static std::string name() { return "MatmulLt"; } }; typedef AutotuneSingleton; + using DeviceMemoryBytes = stream_executor::DeviceAddress; BlasScratchAllocator(OpKernelContext* context) : memory_limit_(0), total_byte_size_(0), context_(context) {} @@ -503,21 +503,22 @@ class BlasScratchAllocator : public se::ScratchAllocator { int64_t GetMemoryLimitInBytes() override { return memory_limit_; } - tsl::StatusOr AllocateBytes(int64_t byte_size) override { + absl::StatusOr AllocateBytes( + int64_t byte_size) override { Tensor temporary_memory; if (memory_limit_ > 0 && byte_size > memory_limit_) { - return tsl::Status{ + return absl::Status{ absl::StatusCode::kUnavailable, absl::StrCat("Requested memory size (", byte_size, ") exceeds the memory limit (", memory_limit_, ").")}; } AllocationAttributes allocation_attr; allocation_attr.retry_on_failure = false; - Status allocation_status(context_->allocate_temp( + absl::Status allocation_status(context_->allocate_temp( DT_UINT8, TensorShape({byte_size}), &temporary_memory)); if (!allocation_status.ok()) { - return tsl::Status{ + return absl::Status{ absl::StatusCode::kUnavailable, absl::StrCat("Failed to allocate requested memory of (", byte_size, ").")}; @@ -526,11 +527,12 @@ class BlasScratchAllocator : public se::ScratchAllocator { // allocator. allocated_tensors_.push_back(temporary_memory); total_byte_size_ += byte_size; - return tsl::StatusOr(DeviceMemoryBytes::MakeFromByteSize( - temporary_memory.flat().data(), - temporary_memory.flat().size())); + return absl::StatusOr( + DeviceMemoryBytes::MakeFromByteSize( + temporary_memory.flat().data(), + temporary_memory.flat().size())); } - int64 TotalByteSize() { return total_byte_size_; } + int64_t TotalByteSize() { return total_byte_size_; } private: int64_t memory_limit_; @@ -548,9 +550,9 @@ struct LaunchBatchMatMul { se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, se::blas::Transpose::kTranspose, se::blas::Transpose::kConjugateTranspose}; - const uint64 m = in_x.dim_size(adj_x || trans_x ? 2 : 1); - const uint64 k = in_x.dim_size(adj_x || trans_x ? 1 : 2); - const uint64 n = in_y.dim_size(adj_y || trans_y ? 1 : 2); + const uint64_t m = in_x.dim_size(adj_x || trans_x ? 2 : 1); + const uint64_t k = in_x.dim_size(adj_x || trans_x ? 1 : 2); + const uint64_t n = in_y.dim_size(adj_y || trans_y ? 1 : 2); const int64_t batch_size = bcast.output_batch_size(); auto blas_transpose_a = trans[adj_x ? 2 : (trans_x ? 1 : 0)]; auto blas_transpose_b = trans[adj_y ? 2 : (trans_y ? 1 : 0)]; @@ -574,9 +576,9 @@ struct LaunchBatchMatMul { auto* a_base_ptr = in_x.template flat().data(); auto* b_base_ptr = in_y.template flat().data(); auto* c_base_ptr = out->template flat().data(); - uint64 a_stride; - uint64 b_stride; - uint64 c_stride; + uint64_t a_stride; + uint64_t b_stride; + uint64_t c_stride; bool is_full_broadcast = std::min(bcast.x_batch_size(), bcast.y_batch_size()) == 1; @@ -658,9 +660,11 @@ struct LaunchBatchMatMul { // Create a new scratch allocator with every autotuning run so that // scratch space is deallocated between runs. 
BlasScratchAllocator scratch_allocator(context, max_scratch_size); - Status cublas_launch_status = plan_and_algorithms->ExecuteOnStream( - stream, *a_ptrs[0], *b_ptrs[0], *c_ptrs[0], i, - scratch_allocator, se::DeviceMemoryBase{}, &profile_result); + absl::Status cublas_launch_status = + plan_and_algorithms->ExecuteOnStream( + stream, *a_ptrs[0], *b_ptrs[0], *c_ptrs[0], i, + scratch_allocator, stream_executor::DeviceAddressBase{}, + &profile_result); VLOG(4) << " Autotune algorithm " << i << " result: " << profile_result.elapsed_time_in_ms() diff --git a/tensorflow/core/kernels/matmul_op_real.cc b/tensorflow/core/kernels/matmul_op_real.cc index 46fbf83a53e067..54049fb852c008 100644 --- a/tensorflow/core/kernels/matmul_op_real.cc +++ b/tensorflow/core/kernels/matmul_op_real.cc @@ -29,18 +29,18 @@ TF_CALL_int64(REGISTER_BATCH_MATMUL_CPU); REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, bfloat16, bfloat16); REGISTER_BATCH_MATMUL_TOUT_CPU(float, float, float); REGISTER_BATCH_MATMUL_TOUT_CPU(double, double, double); -REGISTER_BATCH_MATMUL_TOUT_CPU(int16, int16, int16); -REGISTER_BATCH_MATMUL_TOUT_CPU(int32, int32, int32); +REGISTER_BATCH_MATMUL_TOUT_CPU(int16_t, int16_t, int16_t); +REGISTER_BATCH_MATMUL_TOUT_CPU(int32_t, int32_t, int32_t); REGISTER_BATCH_MATMUL_TOUT_CPU(int64_t, int64_t, int64_t); -REGISTER_BATCH_MATMUL_TOUT_CPU(int8, int8, int32); -REGISTER_BATCH_MATMUL_TOUT_CPU(uint8, int8, int32); -REGISTER_BATCH_MATMUL_TOUT_CPU(int8, uint8, int32); -REGISTER_BATCH_MATMUL_TOUT_CPU(uint8, uint8, int32); - -REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, int8, bfloat16); -REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, uint8, bfloat16); -REGISTER_BATCH_MATMUL_TOUT_CPU(int8, bfloat16, bfloat16); -REGISTER_BATCH_MATMUL_TOUT_CPU(uint8, bfloat16, bfloat16); +REGISTER_BATCH_MATMUL_TOUT_CPU(int8_t, int8_t, int32_t); +REGISTER_BATCH_MATMUL_TOUT_CPU(uint8_t, int8_t, int32_t); +REGISTER_BATCH_MATMUL_TOUT_CPU(int8_t, uint8_t, int32_t); +REGISTER_BATCH_MATMUL_TOUT_CPU(uint8_t, uint8_t, int32_t); + +REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, int8_t, bfloat16); +REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, uint8_t, bfloat16); +REGISTER_BATCH_MATMUL_TOUT_CPU(int8_t, bfloat16, bfloat16); +REGISTER_BATCH_MATMUL_TOUT_CPU(uint8_t, bfloat16, bfloat16); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TF_CALL_GPU_NUMBER_TYPES(REGISTER_BATCH_MATMUL_GPU); diff --git a/tensorflow/core/kernels/matmul_op_test.cc b/tensorflow/core/kernels/matmul_op_test.cc index 4562998b2848aa..e755ceb2beed1a 100644 --- a/tensorflow/core/kernels/matmul_op_test.cc +++ b/tensorflow/core/kernels/matmul_op_test.cc @@ -52,7 +52,7 @@ class FusedMatMulOpTest : public OpsTestBase { // of 'fetch' node into the output Tensor. Optional `fetch_node` parameter // allows to define a fetch node directly using a NodeDef for the ops that are // not supported by the C++ Api. - void RunAndFetch(const tensorflow::Scope& root, const string& fetch, + void RunAndFetch(const tensorflow::Scope& root, const std::string& fetch, Tensor* output, bool allow_gpu_device, const NodeDef* fetch_node = nullptr, absl::Status* last_status = nullptr) { @@ -97,7 +97,8 @@ class FusedMatMulOpTest : public OpsTestBase { // to compare GPU vs CPU numbers, so place all nodes on CPU in this case. const bool place_all_on_gpu = allow_gpu_device && has_gpu_device; - const string device = place_all_on_gpu ? "/device:GPU:0" : "/device:CPU:0"; + const std::string device = + place_all_on_gpu ? 
"/device:GPU:0" : "/device:CPU:0"; for (NodeDef& mutable_node : *graph.mutable_node()) { mutable_node.set_device(device); } @@ -137,7 +138,7 @@ class FusedMatMulOpTest : public OpsTestBase { void RunMatMulWithBiasAndActivation( const Tensor& lhs_data, const Tensor& rhs_data, const Tensor& bias_data, - bool transpose_a, bool transpose_b, const string& activation_type, + bool transpose_a, bool transpose_b, const std::string& activation_type, Tensor* output, bool allow_gpu_device = false) { Scope root = tensorflow::Scope::NewRootScope(); @@ -175,8 +176,8 @@ class FusedMatMulOpTest : public OpsTestBase { void RunFusedMatMulOp(const Tensor& lhs_data, const Tensor& rhs_data, const std::vector& args_data, - const std::vector& fused_ops, bool transpose_a, - bool transpose_b, Tensor* output, + const std::vector& fused_ops, + bool transpose_a, bool transpose_b, Tensor* output, bool allow_gpu_device = false, bool* test_skipped = nullptr) { Scope root = tensorflow::Scope::NewRootScope(); @@ -295,7 +296,7 @@ class FusedMatMulOpTest : public OpsTestBase { // to FusedMatMul. void VerifyConv2DWithBiasAndActivation(int m, int k, int n, bool transpose_a, bool transpose_b, - const string& activation) { + const std::string& activation) { bool use_gpu_device = activation == "Relu" || (this->kTValueType == DT_HALF); const BiasAddGraphRunner run_default = @@ -372,7 +373,7 @@ static auto GetActivations(DataType dtype) { } TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x128x64WithActivation) { - for (const string& activation : GetActivations(this->kTValueType)) { + for (const std::string& activation : GetActivations(this->kTValueType)) { this->VerifyConv2DWithBiasAndActivation(256, 128, 64, false, false, activation); this->VerifyConv2DWithBiasAndActivation(256, 128, 64, true, false, @@ -385,21 +386,21 @@ TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x128x64WithActivation) { } TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul1x256x256WithActivation) { - for (const string& activation : GetActivations(this->kTValueType)) { + for (const std::string& activation : GetActivations(this->kTValueType)) { this->VerifyConv2DWithBiasAndActivation(1, 256, 256, false, false, activation); } } TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x256x1WithActivation) { - for (const string& activation : GetActivations(this->kTValueType)) { + for (const std::string& activation : GetActivations(this->kTValueType)) { this->VerifyConv2DWithBiasAndActivation(256, 256, 1, false, false, activation); } } TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul1x256x1WithActivation) { - for (const string& activation : GetActivations(this->kTValueType)) { + for (const std::string& activation : GetActivations(this->kTValueType)) { this->VerifyConv2DWithBiasAndActivation(1, 256, 1, false, false, activation); } diff --git a/tensorflow/core/kernels/matmul_util.cc b/tensorflow/core/kernels/matmul_util.cc index 3675018709dfc0..cd3a950f8f5c69 100644 --- a/tensorflow/core/kernels/matmul_util.cc +++ b/tensorflow/core/kernels/matmul_util.cc @@ -36,8 +36,7 @@ int64_t GetWorkspaceLimit(int64_t default_value_in_bytes) { if (workspace_limit_in_mb_str != nullptr && strcmp(workspace_limit_in_mb_str, "") != 0) { int64_t scratch_limit_in_mb = -1; - if (strings::safe_strto64(workspace_limit_in_mb_str, - &scratch_limit_in_mb)) { + if (absl::SimpleAtoi(workspace_limit_in_mb_str, &scratch_limit_in_mb)) { return scratch_limit_in_mb * (1 << 20); } else { LOG(WARNING) << "Invalid value for TF_CUBLAS_WORKSPACE_LIMIT_IN_MB: " @@ -77,7 +76,7 @@ struct BlasLtMatmulPlanMap { int 
MatmulMaxAutotuneAlgorithmCount() { int64_t value; - Status status = + absl::Status status = ReadInt64FromEnvVar("TF_MATMUL_AUTOTUNE_MAX_ALGORITHMS", 10, &value); if (!status.ok()) { LOG(ERROR) << status.message(); @@ -90,7 +89,7 @@ int MatmulMaxAutotuneAlgorithmCount() { return value; } -StatusOr GetBlasComputationType( +absl::StatusOr GetBlasComputationType( se::blas::DataType dtype) { using se::blas::ComputationType; static bool use_f32_for_f16_computation = MatmulDoFP32ComputationFP16Input(); @@ -114,9 +113,11 @@ StatusOr GetBlasComputationType( } // namespace -/* static */ StatusOr PlanAndAlgorithms::GetOrCreate( - se::Stream* stream, const BlasLtMatmulPlanParams& params, - absl::Mutex** ppmu, std::optional max_algorithm_count) { +/* static */ absl::StatusOr +PlanAndAlgorithms::GetOrCreate(se::Stream* stream, + const BlasLtMatmulPlanParams& params, + absl::Mutex** ppmu, + std::optional max_algorithm_count) { static const int64_t max_scratch_size = GetWorkspaceLimit(1LL << 32); // 4GB by default static const int64_t max_autotune_algorithm_count = @@ -189,25 +190,27 @@ StatusOr GetBlasComputationType( return ptr->second.get(); } -Status PlanAndAlgorithms::ExecuteOnStream( - se::Stream* stream, const se::DeviceMemoryBase& a, - const se::DeviceMemoryBase& b, se::DeviceMemoryBase& c, - size_t algorithm_idx, se::ScratchAllocator& scratch_allocator, - const se::DeviceMemoryBase& bias, +absl::Status PlanAndAlgorithms::ExecuteOnStream( + se::Stream* stream, const stream_executor::DeviceAddressBase& a, + const stream_executor::DeviceAddressBase& b, + stream_executor::DeviceAddressBase& c, size_t algorithm_idx, + se::ScratchAllocator& scratch_allocator, + const stream_executor::DeviceAddressBase& bias, se::blas::ProfileResult* profile_result) const { if (!plan || algorithm_idx >= algorithms.size()) { return errors::Internal("MatmulPlan or algorithms are not initialized!"); } TF_RETURN_IF_ERROR(plan->SetAlgorithm(algorithms[algorithm_idx])); - return plan->ExecuteOnStream(stream, a, b, c, c, - bias, // bias_buffer - se::DeviceMemoryBase{}, // aux_buffer - se::DeviceMemoryBase{}, // a_scale_buffer - se::DeviceMemoryBase{}, // b_scale_buffer - se::DeviceMemoryBase{}, // c_scale_buffer - se::DeviceMemoryBase{}, // d_scale_buffer - se::DeviceMemoryBase{}, // d_amax_buffer - scratch_allocator, profile_result); + return plan->ExecuteOnStream( + stream, a, b, c, c, + bias, // bias_buffer + stream_executor::DeviceAddressBase{}, // aux_buffer + stream_executor::DeviceAddressBase{}, // a_scale_buffer + stream_executor::DeviceAddressBase{}, // b_scale_buffer + stream_executor::DeviceAddressBase{}, // c_scale_buffer + stream_executor::DeviceAddressBase{}, // d_scale_buffer + stream_executor::DeviceAddressBase{}, // d_amax_buffer + scratch_allocator, profile_result); } } // namespace tensorflow diff --git a/tensorflow/core/kernels/matmul_util.h b/tensorflow/core/kernels/matmul_util.h index 0bf7f8acb48cf1..abcbe0ad1bea44 100644 --- a/tensorflow/core/kernels/matmul_util.h +++ b/tensorflow/core/kernels/matmul_util.h @@ -51,15 +51,17 @@ struct BlasLtMatmulPlanParams { }; struct PlanAndAlgorithms { - static StatusOr GetOrCreate( + static absl::StatusOr GetOrCreate( se::Stream* stream, const BlasLtMatmulPlanParams& params, absl::Mutex** pmu, std::optional max_algorithm_count = std::nullopt); - Status ExecuteOnStream( - se::Stream* stream, const se::DeviceMemoryBase& a, - const se::DeviceMemoryBase& b, se::DeviceMemoryBase& c, - size_t algorithm_idx, se::ScratchAllocator& scratch_allocator, - const 
se::DeviceMemoryBase& bias = se::DeviceMemoryBase{}, + absl::Status ExecuteOnStream( + se::Stream* stream, const stream_executor::DeviceAddressBase& a, + const stream_executor::DeviceAddressBase& b, + stream_executor::DeviceAddressBase& c, size_t algorithm_idx, + se::ScratchAllocator& scratch_allocator, + const stream_executor::DeviceAddressBase& bias = + stream_executor::DeviceAddressBase{}, se::blas::ProfileResult* profile_result = nullptr) const; se::gpu::BlasLt::MatmulPlanPtr plan; diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc index bc99ad59db4543..a9de19492d1aff 100644 --- a/tensorflow/core/kernels/maxpooling_op.cc +++ b/tensorflow/core/kernels/maxpooling_op.cc @@ -227,7 +227,7 @@ template class MaxPoolingGradOp : public OpKernel { public: explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -289,16 +289,16 @@ class MaxPoolingGradOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::v(), tensor_out.shape(), &tensor_out_arg_max)); - std::vector ksize = ksize_; - std::vector stride = stride_; + std::vector ksize = ksize_; + std::vector stride = stride_; if (context->num_inputs() == 5) { const Tensor& tensor_ksize = context->input(3); - auto value_ksize = tensor_ksize.flat(); + auto value_ksize = tensor_ksize.flat(); ksize.resize(tensor_ksize.shape().num_elements()); std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); const Tensor& tensor_stride = context->input(4); - auto value_stride = tensor_stride.flat(); + auto value_stride = tensor_stride.flat(); stride.resize(tensor_stride.shape().num_elements()); std::copy_n(&value_stride(0), stride.size(), stride.begin()); } @@ -351,8 +351,8 @@ class MaxPoolingGradOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; std::vector explicit_paddings_; TensorFormat data_format_; @@ -366,7 +366,7 @@ class MaxPoolingGradOp : public OpKernel { typedef Eigen::GpuDevice Device; explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -412,16 +412,16 @@ class MaxPoolingGradOp : public OpKernel { TensorShape output_shape = tensor_in.shape(); - std::vector ksize = ksize_; - std::vector stride = stride_; + std::vector ksize = ksize_; + std::vector stride = stride_; if (context->num_inputs() == 5) { const Tensor& tensor_ksize = context->input(3); - auto value_ksize = tensor_ksize.flat(); + auto value_ksize = tensor_ksize.flat(); ksize.resize(tensor_ksize.shape().num_elements()); std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); const Tensor& tensor_stride = context->input(4); - auto value_stride = tensor_stride.flat(); + auto value_stride = tensor_stride.flat(); stride.resize(tensor_stride.shape().num_elements()); std::copy_n(&value_stride(0), stride.size(), stride.begin()); } @@ -452,8 +452,8 @@ class MaxPoolingGradOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; 
std::vector explicit_paddings_; TensorFormat data_format_; @@ -473,7 +473,7 @@ class MaxPoolingGradGradOp : public OpKernel { public: explicit MaxPoolingGradGradOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -518,16 +518,16 @@ class MaxPoolingGradGradOp : public OpKernel { context, out_grad_backprop.dims() == 4, errors::InvalidArgument("out_grad_backprop must be 4-dimensional")); - std::vector ksize = ksize_; - std::vector stride = stride_; + std::vector ksize = ksize_; + std::vector stride = stride_; if (context->num_inputs() == 5) { const Tensor& tensor_ksize = context->input(3); - auto value_ksize = tensor_ksize.flat(); + auto value_ksize = tensor_ksize.flat(); ksize.resize(tensor_ksize.shape().num_elements()); std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); const Tensor& tensor_stride = context->input(4); - auto value_stride = tensor_stride.flat(); + auto value_stride = tensor_stride.flat(); stride.resize(tensor_stride.shape().num_elements()); std::copy_n(&value_stride(0), stride.size(), stride.begin()); } @@ -683,8 +683,8 @@ class MaxPoolingGradGradOp : public OpKernel { params.tensor_in_batch, shard_cost, shard); } - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; TensorFormat data_format_; }; @@ -698,7 +698,7 @@ class MaxPoolingGradGradOp : public OpKernel { explicit MaxPoolingGradGradOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -739,16 +739,16 @@ class MaxPoolingGradGradOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, tensor_out.shape(), &output)); - std::vector ksize = ksize_; - std::vector stride = stride_; + std::vector ksize = ksize_; + std::vector stride = stride_; if (context->num_inputs() == 5) { const Tensor& tensor_ksize = context->input(3); - auto value_ksize = tensor_ksize.flat(); + auto value_ksize = tensor_ksize.flat(); ksize.resize(tensor_ksize.shape().num_elements()); std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); const Tensor& tensor_stride = context->input(4); - auto value_stride = tensor_stride.flat(); + auto value_stride = tensor_stride.flat(); stride.resize(tensor_stride.shape().num_elements()); std::copy_n(&value_stride(0), stride.size(), stride.begin()); } @@ -798,8 +798,8 @@ class MaxPoolingGradGradOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; TensorFormat data_format_; bool use_dnn_; @@ -815,7 +815,7 @@ class MaxPoolingNoMaskOp : public OpKernel { public: explicit MaxPoolingNoMaskOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -866,8 +866,8 @@ class MaxPoolingNoMaskOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; TensorFormat 
data_format_; }; @@ -877,7 +877,7 @@ class MaxPoolingNoMaskV2Op : public OpKernel { public: explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -912,17 +912,17 @@ class MaxPoolingNoMaskV2Op : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& tensor_in = context->input(0); - std::vector ksize = ksize_; - std::vector stride = stride_; + std::vector ksize = ksize_; + std::vector stride = stride_; if (context->num_inputs() != 1) { const Tensor& tensor_ksize = context->input(1); - auto value_ksize = tensor_ksize.flat(); + auto value_ksize = tensor_ksize.flat(); ksize.resize(tensor_ksize.shape().num_elements()); std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); const Tensor& tensor_stride = context->input(2); - auto value_stride = tensor_stride.flat(); + auto value_stride = tensor_stride.flat(); stride.resize(tensor_stride.shape().num_elements()); std::copy_n(&value_stride(0), stride.size(), stride.begin()); } @@ -956,8 +956,8 @@ class MaxPoolingNoMaskV2Op : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; TensorFormat data_format_; }; @@ -1036,8 +1036,8 @@ class MaxPoolingWithArgmaxOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; bool propagate_nans_; bool include_batch_in_index_; @@ -1109,7 +1109,7 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel { public: explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format_str; + std::string data_format_str; if (std::is_same::value) { OP_REQUIRES(context, !tensorflow::OpDeterminismRequired(), errors::Unimplemented("Determinism is not yet supported " @@ -1187,8 +1187,8 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; TensorFormat data_format_; bool include_batch_in_index_; @@ -1257,8 +1257,8 @@ class MaxPoolingGradGradWithArgmaxOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; bool include_batch_in_index_; }; @@ -1270,7 +1270,7 @@ class MaxPoolingNoMaskOp : public OpKernel { typedef GPUDevice Device; explicit MaxPoolingNoMaskOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -1372,8 +1372,8 @@ class MaxPoolingNoMaskOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; std::vector explicit_paddings_; TensorFormat data_format_; @@ -1386,7 +1386,7 @@ class MaxPoolingNoMaskV2Op : public OpKernel { typedef GPUDevice Device; explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, 
&data_format_), errors::InvalidArgument("Invalid data format")); @@ -1413,17 +1413,17 @@ class MaxPoolingNoMaskV2Op : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& tensor_in = context->input(0); - std::vector ksize = ksize_; - std::vector stride = stride_; + std::vector ksize = ksize_; + std::vector stride = stride_; if (context->num_inputs() != 1) { const Tensor& tensor_ksize = context->input(1); - auto value_ksize = tensor_ksize.flat(); + auto value_ksize = tensor_ksize.flat(); ksize.resize(tensor_ksize.shape().num_elements()); std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); const Tensor& tensor_stride = context->input(2); - auto value_stride = tensor_stride.flat(); + auto value_stride = tensor_stride.flat(); stride.resize(tensor_stride.shape().num_elements()); std::copy_n(&value_stride(0), stride.size(), stride.begin()); } @@ -1471,8 +1471,8 @@ class MaxPoolingNoMaskV2Op : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; std::vector explicit_paddings_; TensorFormat data_format_; diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc index 759811dd74ec47..e7799161eba16c 100644 --- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc @@ -70,7 +70,7 @@ __global__ void MaxPoolForwardNCHW( const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_t, - const int pad_l, dtype* __restrict__ top_data, int64* __restrict__ mask, + const int pad_l, dtype* __restrict__ top_data, int64_t* __restrict__ mask, const bool include_batch_in_index) { GPU_1D_KERNEL_LOOP(index, nthreads) { int pw = index % pooled_width; @@ -110,13 +110,13 @@ __global__ void MaxPoolForwardNCHW( // the same X, y coordinate. // (so channels = outer_channels, output_size = real output size / 4). __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C( - const int nthreads, const int32* __restrict__ bottom_data, const int height, - const int width, const int channels, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_t, const int pad_l, - int32* __restrict__ top_data) { + const int nthreads, const int32_t* __restrict__ bottom_data, + const int height, const int width, const int channels, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_t, + const int pad_l, int32_t* __restrict__ top_data) { // TODO(pauldonnelly): Implement a better optimized version of this kernel. 
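For reference, the kMinINT8X4 constant appearing just below packs four int8 minima into a single 32-bit word: the NCHW_VECT_C layout stores four int8 channel values per int32, and 0x80 is -128 in two's complement, so 0x80808080 initializes every lane to the smallest representable value before the max reduction. A minimal standalone sketch of that packing (standard library only; illustrative, not taken from the kernel itself):

#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
  // Four packed int8 lanes, all set to INT8_MIN (-128 == 0x80), in one 32-bit word.
  const std::uint32_t packed = 0x80808080u;
  std::int8_t lanes[4];
  std::memcpy(lanes, &packed, sizeof(lanes));
  for (std::int8_t lane : lanes) {
    std::cout << static_cast<int>(lane) << '\n';  // prints -128 for each lane
  }
  return 0;
}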
- const int32 kMinINT8X4 = 0x80808080; + const int32_t kMinINT8X4 = 0x80808080; GPU_1D_KERNEL_LOOP(index, nthreads) { int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -128,8 +128,8 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C( int wend = min(wstart + kernel_w, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - int32 maxval = kMinINT8X4; - const int32* bottom_data_n = bottom_data + n * channels * height * width; + int32_t maxval = kMinINT8X4; + const int32_t* bottom_data_n = bottom_data + n * channels * height * width; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { int idx = (c * height + h) * width + w; @@ -147,7 +147,7 @@ __global__ void MaxPoolForwardNHWC( const int width, const int channels, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_t, const int pad_l, - dtype* __restrict__ top_data, int64* __restrict__ mask, + dtype* __restrict__ top_data, int64_t* __restrict__ mask, const bool include_batch_in_index) { GPU_1D_KERNEL_LOOP(index, nthreads) { int n = index; @@ -203,7 +203,7 @@ __global__ void MaxPoolForwardNHWC( template __global__ void MaxPoolBackward(const int nthreads, const dtype* __restrict__ top_diff, - const int64* __restrict__ mask, + const int64_t* __restrict__ mask, const int top_offset, const int bottom_offset, dtype* __restrict__ bottom_diff, const bool include_batch_in_index) { @@ -332,7 +332,7 @@ __global__ void MaxPoolGradBackwardNoMaskNHWC( template __global__ void MaxPoolGradBackward(const int nthreads, const dtype* __restrict__ top_diff, - const int64* __restrict__ mask, + const int64_t* __restrict__ mask, const int top_offset, const int bottom_offset, dtype* __restrict__ bottom_diff, @@ -353,11 +353,11 @@ namespace functor { // Note: channels is the outer channels (dim 1) which has already been // divided by 4. 
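The int64_t* mask parameters threaded through the forward/backward kernels above and below hold flattened argmax indices into the NHWC input. As I read the documented MaxPoolWithArgmax semantics, a maximum at [b, y, x, c] maps to ((b * height + y) * width + x) * channels + c when include_batch_in_index is true and to (y * width + x) * channels + c otherwise. A standalone sketch of that index arithmetic (helper name and parameters are illustrative only):

#include <cstdint>
#include <iostream>

// Flattened argmax index for a maximum found at (batch b, row y, col x, channel c)
// in an NHWC input, following the documented MaxPoolWithArgmax convention.
std::int64_t FlattenedArgmaxIndex(std::int64_t b, std::int64_t y, std::int64_t x,
                                  std::int64_t c, std::int64_t height,
                                  std::int64_t width, std::int64_t channels,
                                  bool include_batch_in_index) {
  const std::int64_t within_batch = (y * width + x) * channels + c;
  return include_batch_in_index
             ? b * height * width * channels + within_batch
             : within_batch;
}

int main() {
  // Same position, with and without the batch dimension folded in.
  std::cout << FlattenedArgmaxIndex(1, 2, 3, 0, 4, 5, 8, true) << '\n';
  std::cout << FlattenedArgmaxIndex(1, 2, 3, 0, 4, 5, 8, false) << '\n';
  return 0;
}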
bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()( - const int32* bottom_data, const int batch, const int height, + const int32_t* bottom_data, const int batch, const int height, const int width, int channels, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_t, const int pad_l, - int32* top_data, const Eigen::GpuDevice& d) { + int32_t* top_data, const Eigen::GpuDevice& d) { const int kThreadsPerBlock = 1024; const int output_size = batch * channels * pooled_height * pooled_width; if (output_size == 0) return true; @@ -377,7 +377,7 @@ bool MaxPoolForwardWithOptionalArgmax::operator()( const int channels, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_t, const int pad_l, T* top_data, - int64* mask, const Eigen::GpuDevice& d, bool propagate_nans, + int64_t* mask, const Eigen::GpuDevice& d, bool propagate_nans, const bool include_batch_in_index) { const int kThreadsPerBlock = 1024; const int output_size = batch * channels * pooled_height * pooled_width; @@ -405,7 +405,7 @@ bool MaxPoolForwardWithOptionalArgmax::operator()( template bool MaxPoolBackwardWithArgmax::operator()( const int output_size, const int input_size, const T* top_diff, - const int64* mask, const int top_offset, const int bottom_offset, + const int64_t* mask, const int top_offset, const int bottom_offset, T* bottom_diff, const Eigen::GpuDevice& d, const bool include_batch_in_index) { const int kThreadsPerBlock = 1024; @@ -454,7 +454,7 @@ bool MaxPoolGradBackwardNoMask::operator()( template bool MaxPoolGradBackwardWithArgmax::operator()( const int output_size, const int input_size, const T* top_diff, - const int64* mask, const int top_offset, const int bottom_offset, + const int64_t* mask, const int top_offset, const int bottom_offset, T* bottom_diff, const Eigen::GpuDevice& d, const bool include_batch_in_index) { if (input_size == 0) return true; diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h index 650a01e3ff0dc1..3e8ba784d9714e 100644 --- a/tensorflow/core/kernels/maxpooling_op_gpu.h +++ b/tensorflow/core/kernels/maxpooling_op_gpu.h @@ -44,11 +44,11 @@ struct MaxPoolForwardWithOptionalArgmax { }; struct MaxPoolForwardNoMask_NCHW_VECT_C { - bool operator()(const int32* bottom_data, const int batch, const int height, + bool operator()(const int32_t* bottom_data, const int batch, const int height, const int width, int channels, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, - const int pad_t, const int pad_l, int32* top_data, + const int pad_t, const int pad_l, int32_t* top_data, const Eigen::GpuDevice& d); }; diff --git a/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc b/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc index cc838aace88f33..d1185f0d5d7998 100644 --- a/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc +++ b/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc @@ -34,7 +34,8 @@ limitations under the License. 
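The pooling GPU functors above size their launches from an output element count and kThreadsPerBlock = 1024; the block count is typically the ceiling division of one by the other, with an early return when there is nothing to launch. A standalone sketch of that arithmetic (names chosen here for illustration, not quoted from the file):

#include <cstdint>
#include <iostream>

// Enough blocks of threads_per_block threads to cover every output element.
std::int64_t NumBlocks(std::int64_t output_size, std::int64_t threads_per_block) {
  return (output_size + threads_per_block - 1) / threads_per_block;
}

int main() {
  const std::int64_t kThreadsPerBlock = 1024;
  std::cout << NumBlocks(0, kThreadsPerBlock) << '\n';     // 0: nothing to launch
  std::cout << NumBlocks(1, kThreadsPerBlock) << '\n';     // 1
  std::cout << NumBlocks(4096, kThreadsPerBlock) << '\n';  // 4
  std::cout << NumBlocks(4097, kThreadsPerBlock) << '\n';  // 5
  return 0;
}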
namespace tensorflow { namespace { -void WriteCheckpoint(const string& prefix, absl::Span names, +void WriteCheckpoint(const std::string& prefix, + absl::Span names, absl::Span tensors) { BundleWriter writer(Env::Default(), prefix); ASSERT_TRUE(names.size() == tensors.size()); @@ -65,12 +66,12 @@ class MergeV2CheckpointsOpTest : public OpsTestBase { void RunMergeTest(bool delete_old_dirs, bool allow_missing_files) { // Writes two checkpoints. - const std::vector prefixes = { + const std::vector prefixes = { io::JoinPath(testing::TmpDir(), "worker0/ckpt0"), io::JoinPath(testing::TmpDir(), "worker1/ckpt1"), io::JoinPath(testing::TmpDir(), "merged/ckpt") /* merged prefix */}; // In a different directory, to exercise "delete_old_dirs". - const string& kMergedPrefix = prefixes[2]; + const std::string& kMergedPrefix = prefixes[2]; // Only write this particular checkpoint if we do not allow missing files. if (!allow_missing_files) { @@ -123,9 +124,10 @@ class MergeV2CheckpointsOpTest : public OpsTestBase { for (int i = 0; i < 2; ++i) { // If we allow missing files, the first checkpoint file did not exist. if (allow_missing_files && i == 0) continue; - int directory_found = Env::Default() - ->IsDirectory(string(io::Dirname(prefixes[i]))) - .raw_code(); + int directory_found = + Env::Default() + ->IsDirectory(std::string(io::Dirname(prefixes[i]))) + .raw_code(); if (delete_old_dirs) { EXPECT_EQ(error::NOT_FOUND, directory_found); } else { diff --git a/tensorflow/core/kernels/mfcc_op.cc b/tensorflow/core/kernels/mfcc_op.cc index 2c5f9560aaa31c..760781605239fb 100644 --- a/tensorflow/core/kernels/mfcc_op.cc +++ b/tensorflow/core/kernels/mfcc_op.cc @@ -49,7 +49,7 @@ class MfccOp : public OpKernel { errors::InvalidArgument( "Input sample_rate should be a scalar tensor, got ", sample_rate_tensor.shape().DebugString(), " instead.")); - const int32_t sample_rate = sample_rate_tensor.scalar()(); + const int32_t sample_rate = sample_rate_tensor.scalar()(); const int spectrogram_channels = spectrogram.dim_size(2); const int spectrogram_samples = spectrogram.dim_size(1); @@ -105,8 +105,8 @@ class MfccOp : public OpKernel { private: float upper_frequency_limit_; float lower_frequency_limit_; - int32 filterbank_channel_count_; - int32 dct_coefficient_count_; + int32_t filterbank_channel_count_; + int32_t dct_coefficient_count_; }; REGISTER_KERNEL_BUILDER(Name("Mfcc").Device(DEVICE_CPU), MfccOp); diff --git a/tensorflow/core/kernels/mkl/BUILD b/tensorflow/core/kernels/mkl/BUILD index 99786dee930818..702fae1c2b37ea 100644 --- a/tensorflow/core/kernels/mkl/BUILD +++ b/tensorflow/core/kernels/mkl/BUILD @@ -427,7 +427,6 @@ tf_cc_test_mkl( size = "small", srcs = ["mkl_fused_batch_norm_op_test.cc"], linkstatic = 1, - tags = ["cuda-only"], # fails on AMD Rome CPUs as of 2021-03-29 deps = [ ":mkl_conv_op", ":mkl_fused_batch_norm_op", diff --git a/tensorflow/core/kernels/mlir_generated/BUILD b/tensorflow/core/kernels/mlir_generated/BUILD index b2f1e1d16579bf..7105823f79f543 100644 --- a/tensorflow/core/kernels/mlir_generated/BUILD +++ b/tensorflow/core/kernels/mlir_generated/BUILD @@ -557,7 +557,6 @@ tf_cuda_cc_test( tags = tf_cuda_tests_tags() + [ "no_cuda", # TODO(b/196608406): re-enable "no_cuda_asan", # TODO(b/171341759): re-enable. 
- "cuda-only", ], deps = [ ":base_binary_ops_test", diff --git a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc index 6136177effa4f3..9a76c85aba09c7 100644 --- a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc +++ b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc @@ -39,8 +39,8 @@ using GPUDevice = Eigen::GpuDevice; // Kernel for Multinomial op. Data is interpreted to have the following shapes: // scores: [B, S, C]; maxima: [B, S]; output: [B, S]. template -__global__ void MultinomialKernel(int32 nthreads, const int32 num_classes, - const int32 num_samples, +__global__ void MultinomialKernel(int32_t nthreads, const int32_t num_classes, + const int32_t num_samples, const float* __restrict__ scores, const float* __restrict__ maxima, OutputType* __restrict__ output) { @@ -113,7 +113,7 @@ struct MultinomialFunctor { // Necessary for atomicMax() inside the kernel. output.device(d) = output.constant(0LL); - const int32 work_items = batch_size * num_samples * num_classes; + const int32_t work_items = batch_size * num_samples * num_classes; GpuLaunchConfig config = GetGpuLaunchConfig(work_items, d); TF_CHECK_OK(GpuLaunchKernel( MultinomialKernel, config.block_count, diff --git a/tensorflow/core/kernels/multinomial_op_test.cc b/tensorflow/core/kernels/multinomial_op_test.cc index df2d0af01c7bea..e7ce0bddbd6119 100644 --- a/tensorflow/core/kernels/multinomial_op_test.cc +++ b/tensorflow/core/kernels/multinomial_op_test.cc @@ -29,7 +29,7 @@ static Graph* Multinomial(int batch_size, int num_classes, int num_samples) { Tensor logits_t(DT_FLOAT, TensorShape({batch_size, num_classes})); Tensor num_samples_t(DT_INT32, TensorShape()); logits_t.flat().setRandom(); - num_samples_t.scalar().setConstant(num_samples); + num_samples_t.scalar().setConstant(num_samples); Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("multinomial"), "Multinomial") diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc index 8fa7170c3c0c59..61a745df498cdd 100644 --- a/tensorflow/core/kernels/mutex_ops.cc +++ b/tensorflow/core/kernels/mutex_ops.cc @@ -36,7 +36,7 @@ namespace { class Mutex : public ResourceBase { public: - explicit Mutex(OpKernelContext* c, const string& name) + explicit Mutex(OpKernelContext* c, const std::string& name) : locked_(false), thread_pool_(new thread::ThreadPool( c->env(), ThreadOptions(), @@ -46,7 +46,9 @@ class Mutex : public ResourceBase { VLOG(2) << "Creating mutex with name " << name << ": " << this; } - string DebugString() const override { return absl::StrCat("Mutex ", name_); } + std::string DebugString() const override { + return absl::StrCat("Mutex ", name_); + } class LockReleaser { public: @@ -127,7 +129,7 @@ class Mutex : public ResourceBase { condition_variable cv_ TF_GUARDED_BY(mu_); bool locked_ TF_GUARDED_BY(mu_); std::unique_ptr thread_pool_; - string name_; + std::string name_; }; } // namespace diff --git a/tensorflow/core/kernels/nccl_ops.cc b/tensorflow/core/kernels/nccl_ops.cc index 00242596140499..77eb070e628576 100644 --- a/tensorflow/core/kernels/nccl_ops.cc +++ b/tensorflow/core/kernels/nccl_ops.cc @@ -52,7 +52,7 @@ class NcclAsyncOpBase : public AsyncOpKernel { OP_REQUIRES_OK(c, c->GetAttr("shared_name", &collective_prefix_)); } - string GetCollectiveKey(OpKernelContext* c) { + std::string GetCollectiveKey(OpKernelContext* c) { return strings::StrCat(collective_prefix_, ";", c->step_id(), ";", c->frame_iter().frame_id, ":", c->frame_iter().iter_id); @@ -62,7 +62,7 @@ class NcclAsyncOpBase : 
public AsyncOpKernel { private: int num_devices_; - string collective_prefix_; + std::string collective_prefix_; NcclAsyncOpBase(const NcclAsyncOpBase&) = delete; void operator=(const NcclAsyncOpBase&) = delete; @@ -71,7 +71,7 @@ class NcclAsyncOpBase : public AsyncOpKernel { class NcclReduceOpBase : public NcclAsyncOpBase { public: explicit NcclReduceOpBase(OpKernelConstruction* c) : NcclAsyncOpBase(c) { - string reduction; + std::string reduction; OP_REQUIRES_OK(c, c->GetAttr("reduction", &reduction)); if (reduction == "min") { reduction_op_ = ncclMin; @@ -106,7 +106,7 @@ class NcclAllReduceOpKernel : public NcclReduceOpBase { OP_REQUIRES_OK_ASYNC( c, c->forward_input_or_allocate_output({0}, 0, input->shape(), &output), done); - auto actual_done = [c, done](Status s) { + auto actual_done = [c, done](absl::Status s) { OP_REQUIRES_OK_ASYNC(c, s, done); done(); }; @@ -137,7 +137,7 @@ class NcclReduceSendKernel : public NcclReduceOpBase { : NcclReduceOpBase(c) {} void ComputeAsync(OpKernelContext* c, DoneCallback done) override { - auto actual_done = [c, done](Status s) { + auto actual_done = [c, done](absl::Status s) { OP_REQUIRES_OK_ASYNC(c, s, done); done(); }; @@ -173,7 +173,7 @@ class NcclReduceRecvKernel : public NcclReduceOpBase { OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, input->shape(), &output), done); - auto actual_done = [c, done](Status s) { + auto actual_done = [c, done](absl::Status s) { OP_REQUIRES_OK_ASYNC(c, s, done); done(); }; @@ -207,7 +207,7 @@ class NcclBroadcastSendKernel : public NcclAsyncOpBase { : NcclAsyncOpBase(c) {} void ComputeAsync(OpKernelContext* c, DoneCallback done) override { - auto actual_done = [c, done](Status s) { + auto actual_done = [c, done](absl::Status s) { OP_REQUIRES_OK_ASYNC(c, s, done); done(); }; @@ -239,11 +239,11 @@ class NcclBroadcastRecvKernel : public NcclAsyncOpBase { const Tensor& shape_t = c->input(0); TensorShape shape; OP_REQUIRES_OK_ASYNC( - c, TensorShapeUtils::MakeShape(shape_t.vec(), &shape), done); + c, TensorShapeUtils::MakeShape(shape_t.vec(), &shape), done); Tensor* output; OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape, &output), done); - auto actual_done = [c, done](Status s) { + auto actual_done = [c, done](absl::Status s) { OP_REQUIRES_OK_ASYNC(c, s, done); done(); }; diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc index 87b3f4d98d344f..dfaad0122c6e57 100644 --- a/tensorflow/core/kernels/nn_ops_test.cc +++ b/tensorflow/core/kernels/nn_ops_test.cc @@ -57,8 +57,9 @@ limitations under the License. 
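The Status -> absl::Status spelling change running through the NCCL done callbacks above is a rename to the canonical Abseil type; assuming the usual tensorflow::Status-to-absl::Status aliasing, it changes spelling rather than behavior. A minimal sketch of producing and checking an absl::Status, using only Abseil and outside the NCCL code itself:

#include <iostream>

#include "absl/status/status.h"

// Returns OK for non-negative input, InvalidArgument otherwise.
absl::Status CheckNonNegative(int value) {
  if (value < 0) {
    return absl::InvalidArgumentError("value must be non-negative");
  }
  return absl::OkStatus();
}

int main() {
  absl::Status s = CheckNonNegative(-1);
  if (!s.ok()) {
    std::cout << s.message() << '\n';  // "value must be non-negative"
  }
  return 0;
}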
namespace tensorflow { -static void SetConstOp(const string& name, std::initializer_list dims, - DataType data_type, NodeDef* node) { +static void SetConstOp(const std::string& name, + std::initializer_list dims, DataType data_type, + NodeDef* node) { Tensor tensor(data_type, TensorShape(dims)); for (int64_t i = 0; i < tensor.NumElements(); ++i) { switch (data_type) { @@ -81,13 +82,13 @@ static void SetConstOp(const string& name, std::initializer_list dims, .Finalize(node)); } -static void SetConstSizesOp(const string& name, const std::vector& sizes, - NodeDef* node) { +static void SetConstSizesOp(const std::string& name, + const std::vector& sizes, NodeDef* node) { TensorShape shape; shape.AddDim(sizes.size()); Tensor tensor(DT_INT32, shape); for (int64_t i = 0; i < tensor.NumElements(); ++i) { - tensor.flat()(i) = sizes[i]; + tensor.flat()(i) = sizes[i]; } TF_CHECK_OK(NodeDefBuilder(name, "Const") .Attr("dtype", DT_INT32) @@ -112,7 +113,7 @@ static void BM_ConvFloat(::testing::benchmark::State& state, int batch, int filter_rows, int filter_cols, CONV_OP op, int num_threads, int stride, Padding padding, bool use_gpu, DataType data_type, - const string& label) { + const std::string& label) { if (!IsGoogleCudaEnabled() && use_gpu) { state.SkipWithError( absl::StrCat("Skipping GPU test (no --config=cuda): ", label)); @@ -159,19 +160,19 @@ static void BM_ConvFloat(::testing::benchmark::State& state, int batch, SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth}, data_type, graph.add_node()); SetConstSizesOp("input_sizes", - std::vector({batch, rows, cols, in_depth}), + std::vector({batch, rows, cols, in_depth}), graph.add_node()); SetConstSizesOp( "filter_sizes", - std::vector({filter_rows, filter_cols, in_depth, out_depth}), + std::vector({filter_rows, filter_cols, in_depth, out_depth}), graph.add_node()); - SetConstSizesOp("resize_size", std::vector({rows, cols}), + SetConstSizesOp("resize_size", std::vector({rows, cols}), graph.add_node()); TensorShape paddings_shape({4, 2}); Tensor paddings_tensor(DT_INT32, paddings_shape); for (int64_t i = 0; i < paddings_tensor.NumElements(); ++i) { - paddings_tensor.flat()(i) = 0; + paddings_tensor.flat()(i) = 0; } TF_CHECK_OK(NodeDefBuilder("paddings", "Const") .Attr("dtype", DT_INT32) @@ -234,7 +235,7 @@ static void BM_ConvFloat(::testing::benchmark::State& state, int batch, GraphConstructorOptions opts; TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g)); - string device = use_gpu ? "gpu" : "cpu"; + std::string device = use_gpu ? 
"gpu" : "cpu"; test::Benchmark(device, g, &options, nullptr, nullptr, "", /*old_benchmark_api*/ false) .Run(state); @@ -540,7 +541,7 @@ static void BM_ConvFloatDepthwise(::testing::benchmark::State& state, int batch, int filter_rows, int filter_cols, DEPTHWISE_CONV_OP op, int num_threads, int stride, Padding padding, bool use_gpu, - const string& label) { + const std::string& label) { if (!IsGoogleCudaEnabled() && use_gpu) { state.SkipWithError( absl::StrCat("Skipping GPU test (no --config=cuda): ", label)); @@ -594,10 +595,10 @@ static void BM_ConvFloatDepthwise(::testing::benchmark::State& state, int batch, SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth}, dtype, graph.add_node()); SetConstSizesOp("input_sizes", - std::vector({batch, rows, cols, in_depth}), + std::vector({batch, rows, cols, in_depth}), graph.add_node()); SetConstSizesOp("filter_sizes", - std::vector( + std::vector( {filter_rows, filter_cols, in_depth, depth_multiplier}), graph.add_node()); @@ -637,7 +638,7 @@ static void BM_ConvFloatDepthwise(::testing::benchmark::State& state, int batch, GraphConstructorOptions opts; TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g)); - string device = use_gpu ? "gpu" : "cpu"; + std::string device = use_gpu ? "gpu" : "cpu"; test::Benchmark(device, g, &options, nullptr, nullptr, "", /*old_benchmark_api=*/false) .Run(state); @@ -788,7 +789,7 @@ BM_ConvFloatDepthwiseBk_All(bfloat16); static void BM_LRNFloat(::testing::benchmark::State& state, int depth, int cols, int rows, int batch_size, int range, int num_threads, - const string& label) { + const std::string& label) { std::unique_ptr device( DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); @@ -869,7 +870,7 @@ AvgPooling Op static void BM_AvgPool(::testing::benchmark::State& state, int batch_size, int rows, int cols, int depth, int kernel_rows, int kernel_cols, int stride, Padding padding, - int num_threads, const string& label) { + int num_threads, const std::string& label) { std::unique_ptr device( DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); @@ -960,7 +961,7 @@ BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "avgpool10_SAME"); static void BM_AvgPoolBk(::testing::benchmark::State& state, int batch_size, int rows, int cols, int depth, int kernel_rows, int kernel_cols, int stride, Padding padding, - int num_threads, const string& label) { + int num_threads, const std::string& label) { std::unique_ptr device( DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); @@ -979,9 +980,9 @@ static void BM_AvgPoolBk(::testing::benchmark::State& state, int batch_size, TensorShape output_shape({batch_size, out_height, out_width, depth}); TensorShape shape2({4}); Tensor input_shape_tensor(DT_INT32, shape2); - int32 input_dims[] = {batch_size, rows, cols, depth}; + int32_t input_dims[] = {batch_size, rows, cols, depth}; for (int i = 0; i < 4; i++) { - input_shape_tensor.flat()(i) = input_dims[i]; + input_shape_tensor.flat()(i) = input_dims[i]; } inputs.push_back({nullptr, &input_shape_tensor}); @@ -1063,7 +1064,7 @@ MaxPooling Op static void BM_MaxPool(::testing::benchmark::State& state, int batch_size, int rows, int cols, int depth, int kernel_rows, int kernel_cols, int stride, Padding padding, - int num_threads, const string& label) { + int num_threads, const std::string& label) { SessionOptions options; options.config.set_intra_op_parallelism_threads(num_threads); @@ -1158,7 +1159,8 @@ BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "maxpool10_SAME"); static void 
BM_MaxPoolBk(::testing::benchmark::State& state, int batch_size, int rows, int cols, int depth, int kernel_rows, int kernel_cols, int stride, Padding padding, - int num_threads, bool use_gpu, const string& label) { + int num_threads, bool use_gpu, + const std::string& label) { if (!IsGoogleCudaEnabled() && use_gpu) { state.SkipWithError( absl::StrCat("Skipping GPU test (no --config=cuda): ", label)); @@ -1192,7 +1194,7 @@ static void BM_MaxPoolBk(::testing::benchmark::State& state, int batch_size, TF_CHECK_OK(root.status()); Graph* g = new Graph(OpRegistry::Global()); TF_CHECK_OK(root.ToGraph(g)); - string device = use_gpu ? "gpu" : "cpu"; + std::string device = use_gpu ? "gpu" : "cpu"; test::Benchmark(device, g, /*old_benchmark_api*/ false).Run(state); state.SetItemsProcessed(batch_size * rows * cols * depth * @@ -1252,7 +1254,7 @@ Run benchmark with: */ static void BM_ReluFloat(::testing::benchmark::State& state, int batch_size, int rows, int cols, int depth, int num_threads, - const string& label) { + const std::string& label) { std::unique_ptr device( DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); @@ -1323,7 +1325,7 @@ Run benchmark with: */ static void BM_SoftplusFloat(::testing::benchmark::State& state, int batch_size, int rows, int cols, int depth, int num_threads, - const string& label) { + const std::string& label) { std::unique_ptr device( DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); @@ -1392,7 +1394,7 @@ BM_Softplus(32, 14, 14, 576, 4, "softplus10"); static void BM_ImageNetSoftmaxFwd(::testing::benchmark::State& state, int batch_size, int node_depth, int num_threads, bool use_gpu, - const string& label) { + const std::string& label) { if (!IsGoogleCudaEnabled() && use_gpu) { state.SkipWithError( absl::StrCat("Skipping GPU test (no --config=cuda): ", label)); @@ -1409,7 +1411,7 @@ static void BM_ImageNetSoftmaxFwd(::testing::benchmark::State& state, TF_CHECK_OK(root.status()); Graph* g = new Graph(OpRegistry::Global()); TF_CHECK_OK(root.ToGraph(g)); - string device = use_gpu ? "gpu" : "cpu"; + std::string device = use_gpu ? "gpu" : "cpu"; SessionOptions opts; opts.config.set_inter_op_parallelism_threads(1); opts.config.set_intra_op_parallelism_threads(num_threads); @@ -1444,7 +1446,8 @@ BM_ImageNetSoftmaxFwd(8192, 1024, 1, true, "softmax32"); BM_ImageNetSoftmaxFwd(8192, 32768, 1, true, "softmax128"); static void BM_TopK(::testing::benchmark::State& state, int rows, int cols, - int k, int num_threads, bool use_gpu, const string& label) { + int k, int num_threads, bool use_gpu, + const std::string& label) { if (!IsGoogleCudaEnabled() && use_gpu) { state.SkipWithError( absl::StrCat("Skipping GPU test (no --config=cuda): ", label)); @@ -1458,14 +1461,14 @@ static void BM_TopK(::testing::benchmark::State& state, int rows, int cols, input.flat().setRandom(); Tensor input_k(DT_INT32, TensorShape({})); - input_k.scalar()() = k; + input_k.scalar()() = k; auto top_k = ops::TopK(root, input, input_k, ops::TopK::Sorted(true)); TF_CHECK_OK(root.status()); Graph* g = new Graph(OpRegistry::Global()); TF_CHECK_OK(root.ToGraph(g)); - string device = use_gpu ? "gpu" : "cpu"; + std::string device = use_gpu ? 
"gpu" : "cpu"; SessionOptions opts; opts.config.set_inter_op_parallelism_threads(1); opts.config.set_intra_op_parallelism_threads(num_threads); diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc index 104a4c9421d188..12db3b63d8cdad 100644 --- a/tensorflow/core/kernels/nth_element_op.cc +++ b/tensorflow/core/kernels/nth_element_op.cc @@ -43,7 +43,7 @@ class NthElementOp : public OpKernel { OP_REQUIRES( context, TensorShapeUtils::IsScalar(n_in.shape()), errors::InvalidArgument("N must be scalar but has rank ", n_in.dims())); - int n = n_in.scalar()(); + int n = n_in.scalar()(); OP_REQUIRES(context, n >= 0, errors::InvalidArgument("n must be non-negative but is ", n)); diff --git a/tensorflow/core/kernels/one_hot_op.cc b/tensorflow/core/kernels/one_hot_op.cc index 1a7ef6a9a46d0f..4a205ac3503f2e 100644 --- a/tensorflow/core/kernels/one_hot_op.cc +++ b/tensorflow/core/kernels/one_hot_op.cc @@ -77,7 +77,7 @@ class OneHotOp : public OpKernel { const int axis = (axis_ == -1) ? indices_dims : axis_; // The one-hot dimension. - const int32_t depth_v = depth.scalar()(); + const int32_t depth_v = depth.scalar()(); OP_REQUIRES( ctx, depth_v >= 0, errors::InvalidArgument("depth must be non-negative, got: ", depth_v)); @@ -122,7 +122,7 @@ class OneHotOp : public OpKernel { } private: - int32 axis_; + int32_t axis_; OneHotOp(const OneHotOp&) = delete; void operator=(const OneHotOp&) = delete; diff --git a/tensorflow/core/kernels/one_hot_op_test.cc b/tensorflow/core/kernels/one_hot_op_test.cc index 6801b29e2509f7..09cb9b8d9388ea 100644 --- a/tensorflow/core/kernels/one_hot_op_test.cc +++ b/tensorflow/core/kernels/one_hot_op_test.cc @@ -30,13 +30,13 @@ static Graph* OneHot(int batch_size, int num_classes, int axis) { std::mt19937 gen(rd()); std::uniform_int_distribution<> dist(0, num_classes - 1); - auto indices_t = indices.flat(); + auto indices_t = indices.flat(); for (int i = 0; i < batch_size; ++i) { indices_t(i) = dist(gen); } Tensor depth(DT_INT32, TensorShape({})); - depth.scalar()() = num_classes; + depth.scalar()() = num_classes; Tensor on_value(DT_FLOAT, TensorShape({})); on_value.scalar()() = 1.0f; diff --git a/tensorflow/core/kernels/ops_testutil.cc b/tensorflow/core/kernels/ops_testutil.cc index 4efbac731bcaf2..ec0c6a1adcadf5 100644 --- a/tensorflow/core/kernels/ops_testutil.cc +++ b/tensorflow/core/kernels/ops_testutil.cc @@ -176,7 +176,7 @@ void OpsTestBase::CreateContext() { params_->frame_iter = FrameAndIter(0, 0); params_->inputs = inputs_; params_->op_kernel = kernel_.get(); - step_container_.reset(new ScopedStepContainer(0, [](const string&) {})); + step_container_.reset(new ScopedStepContainer(0, [](const std::string&) {})); params_->step_container = step_container_.get(); test::SetOutputAttrs(params_.get(), &out_alloc_attrs_); params_->slice_reader_cache = &slice_reader_cache_wrapper_; diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h index ef4a7cd5142cde..da2ccad9cbba72 100644 --- a/tensorflow/core/kernels/ops_testutil.h +++ b/tensorflow/core/kernels/ops_testutil.h @@ -119,7 +119,7 @@ class OpsTestBase : public ::testing::Test { // Adds a Resource type as input. If is empty, uses the default // container name. 
template - void AddResourceInput(const string& container, const string& name, + void AddResourceInput(const std::string& container, const std::string& name, T* resource) { CHECK_GT(input_types_.size(), inputs_.size()) << "Adding more inputs than types; perhaps you need to call MakeOp"; diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc index 0e60b84dc9ff25..f4c1db06bad961 100644 --- a/tensorflow/core/kernels/pack_op.cc +++ b/tensorflow/core/kernels/pack_op.cc @@ -168,8 +168,8 @@ REGISTER_KERNEL_BUILDER(Name("Pack") .Device(DEVICE_GPU) .HostMemory("values") .HostMemory("output") - .TypeConstraint("T"), - PackOp); + .TypeConstraint("T"), + PackOp); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc index c650648147adf8..890a9954faa4a7 100644 --- a/tensorflow/core/kernels/pad_op.cc +++ b/tensorflow/core/kernels/pad_op.cc @@ -400,38 +400,38 @@ TF_CALL_uint8(REGISTER_GPU_KERNEL); // registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("Pad") .Device(DEVICE_GPU) - .TypeConstraint("T") - .TypeConstraint("Tpaddings") + .TypeConstraint("T") + .TypeConstraint("Tpaddings") .HostMemory("input") .HostMemory("paddings") .HostMemory("output"), - PadOp); + PadOp); REGISTER_KERNEL_BUILDER(Name("Pad") .Device(DEVICE_GPU) - .TypeConstraint("T") + .TypeConstraint("T") .TypeConstraint("Tpaddings") .HostMemory("input") .HostMemory("paddings") .HostMemory("output"), - PadOp); + PadOp); REGISTER_KERNEL_BUILDER(Name("PadV2") .Device(DEVICE_GPU) - .TypeConstraint("T") - .TypeConstraint("Tpaddings") + .TypeConstraint("T") + .TypeConstraint("Tpaddings") .HostMemory("input") .HostMemory("paddings") .HostMemory("constant_values") .HostMemory("output"), - PadOp); + PadOp); REGISTER_KERNEL_BUILDER(Name("PadV2") .Device(DEVICE_GPU) - .TypeConstraint("T") + .TypeConstraint("T") .TypeConstraint("Tpaddings") .HostMemory("input") .HostMemory("paddings") .HostMemory("constant_values") .HostMemory("output"), - PadOp); + PadOp); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // end namespace tensorflow diff --git a/tensorflow/core/kernels/padding_fifo_queue.cc b/tensorflow/core/kernels/padding_fifo_queue.cc index 3b50099fb9997c..bd9a07006a8870 100644 --- a/tensorflow/core/kernels/padding_fifo_queue.cc +++ b/tensorflow/core/kernels/padding_fifo_queue.cc @@ -36,7 +36,8 @@ namespace tensorflow { PaddingFIFOQueue::PaddingFIFOQueue( int capacity, const DataTypeVector& component_dtypes, - const std::vector& component_shapes, const string& name) + const std::vector& component_shapes, + const std::string& name) : FIFOQueue(capacity, component_dtypes, ConvertShapesPartialDimensionsToZero(component_shapes), name), partial_shapes_(component_shapes) {} diff --git a/tensorflow/core/kernels/padding_fifo_queue.h b/tensorflow/core/kernels/padding_fifo_queue.h index 74107e80b1977b..f05862ff9b3bdd 100644 --- a/tensorflow/core/kernels/padding_fifo_queue.h +++ b/tensorflow/core/kernels/padding_fifo_queue.h @@ -36,7 +36,7 @@ class PaddingFIFOQueue : public FIFOQueue { public: PaddingFIFOQueue(int32_t capacity, const DataTypeVector& component_dtypes, const std::vector& component_shapes, - const string& name); + const std::string& name); absl::Status Initialize() override; diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc index 782c22c5efd43c..66ec30bc4a2136 100644 --- 
a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc @@ -777,8 +777,8 @@ class StatelessParameterizedTruncatedNormal : public OpKernel { shape_tensor.shape().DebugString())); TensorShape output_shape; if (shape_tensor.dtype() == DataType::DT_INT32) { - OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(shape_tensor.vec(), - &output_shape)); + OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape( + shape_tensor.vec(), &output_shape)); } else { OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape( shape_tensor.vec(), &output_shape)); diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc index e7b76653dc329e..0fbb33816c8b14 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc @@ -51,16 +51,16 @@ typedef Eigen::GpuDevice GPUDevice; template __global__ void __launch_bounds__(1024) - TruncatedNormalKernel(random::PhiloxRandom gen, T* data, int64 num_batches, - int64 samples_per_batch, int64 num_elements, - const T* __restrict__ means, bool single_mean, - const T* __restrict__ stddevs, bool single_stddev, - const T* __restrict__ minvals, bool single_minval, - const T* __restrict__ maxvals, bool single_maxval, - int64 kMaxIterations) { - const int32 max_samples_per_item = 2 * kMaxIterations; + TruncatedNormalKernel(random::PhiloxRandom gen, T* data, + int64_t num_batches, int64_t samples_per_batch, + int64_t num_elements, const T* __restrict__ means, + bool single_mean, const T* __restrict__ stddevs, + bool single_stddev, const T* __restrict__ minvals, + bool single_minval, const T* __restrict__ maxvals, + bool single_maxval, int64_t kMaxIterations) { + const int32_t max_samples_per_item = 2 * kMaxIterations; // Initial offset as given by GPU_1D_KERNEL_LOOP. - const int32 initial_offset = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t initial_offset = blockIdx.x * blockDim.x + threadIdx.x; gen.Skip(max_samples_per_item * initial_offset); typedef random::UniformDistribution Uniform; typedef random::NormalDistribution Normal; @@ -82,15 +82,15 @@ __global__ void __launch_bounds__(1024) // skips max_samples_per_item in the generator. Then after generating this // item, we need to skip the samples for one element for every thread to get // to the next element that we actually process. - const int32 samples_between_processed_elements = + const int32_t samples_between_processed_elements = max_samples_per_item * (gridDim.x * blockDim.x); GPU_1D_KERNEL_LOOP(offset, num_elements) { // Track how many more samples we need to skip before we process the next // element. - int32 remaining_samples = samples_between_processed_elements; + int32_t remaining_samples = samples_between_processed_elements; - const int64 batch_id = offset / samples_per_batch; + const int64_t batch_id = offset / samples_per_batch; T mean = means[single_mean ? 0 : batch_id]; const T input_stddev = stddevs[single_stddev ? 0 : batch_id]; T minval = minvals[single_minval ? 
0 : batch_id]; @@ -231,8 +231,8 @@ __global__ void __launch_bounds__(1024) // Partial specialization for GPU template struct TruncatedNormalFunctor { - void operator()(OpKernelContext* ctx, const GPUDevice& d, int64 num_batches, - int64 samples_per_batch, int64 num_elements, + void operator()(OpKernelContext* ctx, const GPUDevice& d, int64_t num_batches, + int64_t samples_per_batch, int64_t num_elements, typename TTypes::ConstFlat means, typename TTypes::ConstFlat stddevs, typename TTypes::ConstFlat minvals, diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc index 1d5865587e0c13..1257b8da742ce2 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc @@ -27,7 +27,7 @@ namespace tensorflow { static Graph* PTruncatedNormal(int num_batches, int samples_per_batch) { Graph* g = new Graph(OpRegistry::Global()); Tensor shape_t(DT_INT32, TensorShape({2})); - shape_t.flat().setValues({num_batches, samples_per_batch}); + shape_t.flat().setValues({num_batches, samples_per_batch}); // Use mean 0 and stdev 1 Tensor means_t(DT_FLOAT, TensorShape({num_batches})); @@ -56,7 +56,7 @@ static Graph* PTruncatedNormal(int num_batches, int samples_per_batch) { static Graph* PTruncatedNormal2SD(int num_batches, int samples_per_batch) { Graph* g = new Graph(OpRegistry::Global()); Tensor shape_t(DT_INT32, TensorShape({2})); - shape_t.flat().setValues({num_batches, samples_per_batch}); + shape_t.flat().setValues({num_batches, samples_per_batch}); Tensor means_t(DT_FLOAT, TensorShape({num_batches})); means_t.flat().setConstant(0.0); @@ -83,7 +83,7 @@ static Graph* PTruncatedNormal2SD(int num_batches, int samples_per_batch) { static Graph* PTruncatedNormalOneTail(int num_batches, int samples_per_batch) { Graph* g = new Graph(OpRegistry::Global()); Tensor shape_t(DT_INT32, TensorShape({2})); - shape_t.flat().setValues({num_batches, samples_per_batch}); + shape_t.flat().setValues({num_batches, samples_per_batch}); Tensor means_t(DT_FLOAT, TensorShape({num_batches})); means_t.flat().setConstant(0.0); diff --git a/tensorflow/core/kernels/parse_tensor_test.cc b/tensorflow/core/kernels/parse_tensor_test.cc index 1473eff064e3ea..d5a40489b64fd3 100644 --- a/tensorflow/core/kernels/parse_tensor_test.cc +++ b/tensorflow/core/kernels/parse_tensor_test.cc @@ -106,8 +106,9 @@ TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_double) { } TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int64) { - MakeOp(TensorShape({2, 3, 4}), - [](int x) -> int64 { return static_cast(x - 10); }); + MakeOp(TensorShape({2, 3, 4}), [](int x) -> int64_t { + return static_cast(x - 10); + }); TF_ASSERT_OK(RunOpKernel()); Tensor parse_output; ParseSerializedOutput(GetOutput(0), &parse_output); @@ -115,48 +116,50 @@ TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int64) { } TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int32) { - MakeOp(TensorShape({4, 2}), - [](int x) -> int32 { return static_cast(x + 7); }); + MakeOp(TensorShape({4, 2}), + [](int x) -> int32_t { return static_cast(x + 7); }); TF_ASSERT_OK(RunOpKernel()); Tensor parse_output; - ParseSerializedOutput(GetOutput(0), &parse_output); - test::ExpectTensorEqual(parse_output, GetInput(0)); + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); } TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int16) { - MakeOp(TensorShape({8}), - [](int 
x) -> int16 { return static_cast(x + 18); }); + MakeOp(TensorShape({8}), [](int x) -> int16_t { + return static_cast(x + 18); + }); TF_ASSERT_OK(RunOpKernel()); Tensor parse_output; - ParseSerializedOutput(GetOutput(0), &parse_output); - test::ExpectTensorEqual(parse_output, GetInput(0)); + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); } TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int8) { - MakeOp(TensorShape({2}), - [](int x) -> int8 { return static_cast(x + 8); }); + MakeOp(TensorShape({2}), + [](int x) -> int8_t { return static_cast(x + 8); }); TF_ASSERT_OK(RunOpKernel()); Tensor parse_output; - ParseSerializedOutput(GetOutput(0), &parse_output); - test::ExpectTensorEqual(parse_output, GetInput(0)); + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); } TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_uint16) { - MakeOp(TensorShape({1, 3}), - [](int x) -> uint16 { return static_cast(x + 2); }); + MakeOp(TensorShape({1, 3}), [](int x) -> uint16_t { + return static_cast(x + 2); + }); TF_ASSERT_OK(RunOpKernel()); Tensor parse_output; - ParseSerializedOutput(GetOutput(0), &parse_output); - test::ExpectTensorEqual(parse_output, GetInput(0)); + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); } TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_uint8) { - MakeOp(TensorShape({2, 1, 1}), - [](int x) -> uint8 { return static_cast(x + 1); }); + MakeOp(TensorShape({2, 1, 1}), + [](int x) -> uint8_t { return static_cast(x + 1); }); TF_ASSERT_OK(RunOpKernel()); Tensor parse_output; - ParseSerializedOutput(GetOutput(0), &parse_output); - test::ExpectTensorEqual(parse_output, GetInput(0)); + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); } TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_complex64) { diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc index 97b08ce6fd2982..bbff2dc35654ad 100644 --- a/tensorflow/core/kernels/partitioned_function_ops.cc +++ b/tensorflow/core/kernels/partitioned_function_ops.cc @@ -43,9 +43,9 @@ PartitionedCallOp::PartitionedCallOp(OpKernelConstruction* ctx) shared_rendezvous_(false) { OP_REQUIRES_OK( ctx, ctx->GetAttr(FunctionLibraryDefinition::kFuncAttr, func_.get())); - string deprecated_config_serialized; + std::string deprecated_config_serialized; OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &deprecated_config_serialized)); - string config_proto_serialized; + std::string config_proto_serialized; OP_REQUIRES_OK(ctx, ctx->GetAttr("config_proto", &config_proto_serialized)); OP_REQUIRES( ctx, @@ -232,7 +232,7 @@ void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle, FunctionLibraryRuntime::Options run_opts; ResourceMgr* resource_mgr = lib->device()->resource_manager(); ScopedStepContainer* step_container = new ScopedStepContainer( - run_opts.step_id, [resource_mgr](const string& name) { + run_opts.step_id, [resource_mgr](const std::string& name) { resource_mgr->Cleanup(name).IgnoreError(); }); run_opts.step_container = step_container; @@ -251,13 +251,13 @@ void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle, } std::vector* rets = new std::vector; - const string& func_name = func_->name(); + const std::string& func_name = func_->name(); tsl::profiler::TraceMe trace_me("PartitionedCallOp"); lib->Run(run_opts, handle, inputs, 
rets, [rets, done = std::move(done), ctx, func_name, step_container](const absl::Status& status) { if (!status.ok()) { - const string function_and_msg = + const std::string function_and_msg = absl::StrCat(errors::FormatFunctionForError(func_name), " ", status.message()); ctx->SetStatus( diff --git a/tensorflow/core/kernels/partitioned_function_ops.h b/tensorflow/core/kernels/partitioned_function_ops.h index 2b2ec8ea959f7c..f38ad56e8a9f73 100644 --- a/tensorflow/core/kernels/partitioned_function_ops.h +++ b/tensorflow/core/kernels/partitioned_function_ops.h @@ -57,7 +57,7 @@ class PartitionedCallOp : public AsyncOpKernel { // Using unique pointers to avoid including proto headers in kernel headers std::unique_ptr func_; std::unique_ptr config_proto_; - string executor_type_; + std::string executor_type_; bool shared_rendezvous_; mutex mu_; // Cache the handle per FLR because this kernel may be instantiated for diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc index 28e24e79fe0bcf..a63a176032f953 100644 --- a/tensorflow/core/kernels/pooling_ops_3d.cc +++ b/tensorflow/core/kernels/pooling_ops_3d.cc @@ -46,8 +46,8 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; Pool3dParameters::Pool3dParameters(OpKernelContext* context, - const std::vector& ksize, - const std::vector& stride, + const std::vector& ksize, + const std::vector& stride, Padding padding, TensorFormat data_format, const TensorShape& tensor_in_shape) { // For maxpooling, tensor_in should have 4 dimensions. @@ -97,9 +97,9 @@ absl::Status Pool3dParameters::forward_output_shape(TensorShape* shape) { template struct LaunchPoolingOp { static void launch(OpKernelContext* context, const Tensor& tensor_in, - const std::array& window, - const std::array& stride, - const std::array& padding, + const std::array& window, + const std::array& stride, + const std::array& padding, TensorFormat data_format, Padding padding_type, Tensor* output) { output->tensor().device(context->eigen_device()) = @@ -112,9 +112,9 @@ struct LaunchPoolingOp { template struct LaunchPoolingOp { static void launch(OpKernelContext* context, const Tensor& tensor_in, - const std::array& window, - const std::array& stride, - const std::array& padding, + const std::array& window, + const std::array& stride, + const std::array& padding, TensorFormat data_format, Padding padding_type, Tensor* output) { output->tensor().device(context->eigen_device()) = @@ -128,7 +128,7 @@ template class Pooling3DOp : public UnaryOp { public: explicit Pooling3DOp(OpKernelConstruction* context) : UnaryOp(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -204,8 +204,8 @@ class Pooling3DOp : public UnaryOp { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; TensorFormat data_format_; }; @@ -214,10 +214,10 @@ template struct LaunchMaxPooling3dGradOp { static void launch(OpKernelContext* context, const Tensor& tensor_in, const Tensor& tensor_out, const Tensor& out_backprop, - const std::array& window, - const std::array& stride, - const std::array& out, - const std::array& padding, + const std::array& window, + const std::array& stride, + const std::array& out, + const std::array& padding, TensorFormat data_format, Tensor* output) { 
output->flat().setZero(); for (int64_t p = 0; p < out_backprop.dim_size(3); ++p) { @@ -307,7 +307,7 @@ class MaxPooling3dGradOp : public OpKernel { public: explicit MaxPooling3dGradOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -391,8 +391,8 @@ class MaxPooling3dGradOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; TensorFormat data_format_; }; @@ -402,10 +402,10 @@ struct LaunchAvgPooling3dGradOp { static void launch(OpKernelContext* context, const TensorShape& tensor_in_shape, const Tensor& out_backprop, - const std::array& window, - const std::array& stride, - const std::array& output_shape, - const std::array& padding, + const std::array& window, + const std::array& stride, + const std::array& output_shape, + const std::array& padding, TensorFormat data_format, Tensor* output) { OP_REQUIRES( context, tensor_in_shape.dim_size(0) == out_backprop.dim_size(0), @@ -487,7 +487,7 @@ class AvgPooling3dGradOp : public OpKernel { public: explicit AvgPooling3dGradOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -536,7 +536,7 @@ class AvgPooling3dGradOp : public OpKernel { errors::InvalidArgument("out_backprop must be 5-dimensional")); TensorShape output_shape; - auto shape_vec = tensor_in_shape.vec(); + auto shape_vec = tensor_in_shape.vec(); for (int64_t i = 0; i < tensor_in_shape.NumElements(); ++i) { OP_REQUIRES_OK(context, output_shape.AddDimWithStatus(shape_vec(i))); } @@ -568,8 +568,8 @@ class AvgPooling3dGradOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; TensorFormat data_format_; }; @@ -693,7 +693,7 @@ class MaxPooling3dGradGradOp : public OpKernel { public: explicit MaxPooling3dGradGradOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -779,8 +779,8 @@ class MaxPooling3dGradGradOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; TensorFormat data_format_; }; @@ -816,9 +816,9 @@ TF_CALL_bfloat16(REGISTER_CPU_KERNELS); template struct LaunchPoolingOp { static void launch(OpKernelContext* context, const Tensor& tensor_in, - const std::array& window, - const std::array& stride, - const std::array& padding, + const std::array& window, + const std::array& stride, + const std::array& padding, TensorFormat data_format, Padding padding_type, Tensor* output) { DnnPooling3dOp::Compute(context, se::dnn::PoolingMode::kAverage, window, @@ -829,9 +829,9 @@ struct LaunchPoolingOp { template struct LaunchPoolingOp { static void launch(OpKernelContext* context, const Tensor& tensor_in, - const std::array& window, - const std::array& stride, - const std::array& padding, + const std::array& window, + const 
std::array& stride, + const std::array& padding, TensorFormat data_format, Padding padding_type, Tensor* output) { DnnPooling3dOp::Compute(context, se::dnn::PoolingMode::kMaximum, window, @@ -843,10 +843,10 @@ template struct LaunchMaxPooling3dGradOp { static void launch(OpKernelContext* context, const Tensor& tensor_in, const Tensor& tensor_out, const Tensor& out_backprop, - const std::array& window, - const std::array& stride, - const std::array& out, - const std::array& padding, + const std::array& window, + const std::array& stride, + const std::array& out, + const std::array& padding, TensorFormat data_format, Tensor* input_backprop) { const TensorShape output_shape = tensor_in.shape(); DnnPooling3dGradOp::Compute(context, se::dnn::PoolingMode::kMaximum, @@ -861,10 +861,10 @@ struct LaunchAvgPooling3dGradOp { static void launch(OpKernelContext* context, const TensorShape& tensor_in_shape, const Tensor& out_backprop, - const std::array& window, - const std::array& stride, - const std::array& out, - const std::array& padding, + const std::array& window, + const std::array& stride, + const std::array& out, + const std::array& padding, TensorFormat data_format, Tensor* output) { DnnPooling3dGradOp::Compute( context, se::dnn::PoolingMode::kAverage, window, stride, padding, out, diff --git a/tensorflow/core/kernels/pooling_ops_3d.h b/tensorflow/core/kernels/pooling_ops_3d.h index c0a589ff95092a..edc59f89f760bb 100644 --- a/tensorflow/core/kernels/pooling_ops_3d.h +++ b/tensorflow/core/kernels/pooling_ops_3d.h @@ -39,8 +39,8 @@ struct LaunchMaxPooling3dGradGradOp; // A helper class to manage sizes and shapes for 3d pooling operations. struct Pool3dParameters { // Updates context->status if there is an invalid input. - Pool3dParameters(OpKernelContext* context, const std::vector& ksize, - const std::vector& stride, Padding padding, + Pool3dParameters(OpKernelContext* context, const std::vector& ksize, + const std::vector& stride, Padding padding, TensorFormat data_format, const TensorShape& tensor_in_shape); diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc index 4ccca647c154aa..24ed53d027442e 100644 --- a/tensorflow/core/kernels/pooling_ops_common.cc +++ b/tensorflow/core/kernels/pooling_ops_common.cc @@ -49,34 +49,35 @@ struct RawType { template <> struct RawType { - using type = int8; + using type = int8_t; }; #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template struct PadInputWithNegativeInf { - Status operator()(const GPUDevice& d, - typename TTypes::ConstTensor in, - int input_pad_top, int input_pad_bottom, int input_pad_left, - int input_pad_right, typename TTypes::Tensor out, - TensorFormat format) { + absl::Status operator()(const GPUDevice& d, + typename TTypes::ConstTensor in, + int input_pad_top, int input_pad_bottom, + int input_pad_left, int input_pad_right, + typename TTypes::Tensor out, + TensorFormat format) { T padding_value = -std::numeric_limits::infinity(); functor::PadInput()( d, in, {{input_pad_top, input_pad_left}}, {{input_pad_bottom, input_pad_right}}, out, format, padding_value); - return OkStatus(); + return absl::OkStatus(); } }; template <> struct PadInputWithNegativeInf { - Status operator()(const GPUDevice& d, - typename TTypes::ConstTensor in, - int input_pad_top, int input_pad_bottom, int input_pad_left, - int input_pad_right, - typename TTypes::Tensor out, - TensorFormat format) { + absl::Status operator()(const GPUDevice& d, + typename TTypes::ConstTensor in, + int input_pad_top, int input_pad_bottom, + 
int input_pad_left, int input_pad_right, + typename TTypes::Tensor out, + TensorFormat format) { return errors::InvalidArgument( "Explicit padding not yet supported with qint8"); } @@ -117,8 +118,8 @@ absl::Status CheckPaddingSize(int64_t window_rows, int64_t window_cols, } PoolParameters::PoolParameters(OpKernelContext* context, - const std::vector& ksize, - const std::vector& stride, + const std::vector& ksize, + const std::vector& stride, Padding padding, std::vector explicit_paddings, TensorFormat data_format, @@ -227,8 +228,8 @@ absl::Status PoolParameters::forward_output_shape(TensorShape* shape) { template void DnnPoolingImpl(OpKernelContext* context, se::dnn::PoolingMode pooling_mode, - const std::vector& size, - const std::vector& stride, Padding padding, + const std::vector& size, + const std::vector& stride, Padding padding, std::vector explicit_paddings, TensorFormat data_format, const Tensor& tensor_in, const TensorShape& tensor_out_shape, bool propagate_nans, @@ -438,14 +439,12 @@ void DnnPoolingImpl(OpKernelContext* context, se::dnn::PoolingMode pooling_mode, } template -void DnnPoolingOp::Compute(OpKernelContext* context, - se::dnn::PoolingMode pooling_mode, - const std::vector& size, - const std::vector& stride, Padding padding, - std::vector explicit_paddings, - TensorFormat data_format, const Tensor& tensor_in, - const TensorShape& tensor_out_shape, - bool propagate_nans) { +void DnnPoolingOp::Compute( + OpKernelContext* context, se::dnn::PoolingMode pooling_mode, + const std::vector& size, const std::vector& stride, + Padding padding, std::vector explicit_paddings, + TensorFormat data_format, const Tensor& tensor_in, + const TensorShape& tensor_out_shape, bool propagate_nans) { Tensor* tensor_out = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, tensor_out_shape, &tensor_out)); @@ -457,7 +456,7 @@ void DnnPoolingOp::Compute(OpKernelContext* context, template <> void DnnPoolingOp::Compute( OpKernelContext* context, se::dnn::PoolingMode pooling_mode, - const std::vector& size, const std::vector& stride, + const std::vector& size, const std::vector& stride, Padding padding, std::vector explicit_paddings, TensorFormat data_format, const Tensor& tensor_in, const TensorShape& tensor_out_shape, bool propagate_nans) { @@ -511,14 +510,14 @@ DECLARE_GPU_SPEC(float); DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(Eigen::bfloat16); DECLARE_GPU_SPEC(double); -DECLARE_GPU_SPEC(int32); +DECLARE_GPU_SPEC(int32_t); } // namespace functor template void DnnPoolingGradImpl(OpKernelContext* context, se::dnn::PoolingMode pooling_mode, - const std::vector& size, - const std::vector& stride, Padding padding, + const std::vector& size, + const std::vector& stride, Padding padding, std::vector explicit_paddings, TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out, const Tensor& out_backprop, @@ -856,7 +855,7 @@ void DnnPoolingGradImpl(OpKernelContext* context, template void DnnPoolingGradOp::Compute( OpKernelContext* context, se::dnn::PoolingMode pooling_mode, - const std::vector& size, const std::vector& stride, + const std::vector& size, const std::vector& stride, Padding padding, std::vector explicit_paddings, TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out, const Tensor& out_backprop, const TensorShape& tensor_in_shape, @@ -873,7 +872,7 @@ void DnnPoolingGradOp::Compute( template <> void DnnPoolingGradOp::Compute( OpKernelContext* context, se::dnn::PoolingMode pooling_mode, - const std::vector& size, const std::vector& stride, 
+ const std::vector& size, const std::vector& stride, Padding padding, std::vector explicit_paddings, TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out, const Tensor& out_backprop, const TensorShape& tensor_in_shape, diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h index bb5dda562af672..cced70b25d4a39 100644 --- a/tensorflow/core/kernels/pooling_ops_common.h +++ b/tensorflow/core/kernels/pooling_ops_common.h @@ -47,8 +47,8 @@ struct PoolParameters { // Updates context->status if there is an invalid input. // explicit_paddings has eight elements if padding==EXPLIICT, and zero // elements otherwise. - PoolParameters(OpKernelContext* context, const std::vector& ksize, - const std::vector& stride, Padding padding, + PoolParameters(OpKernelContext* context, const std::vector& ksize, + const std::vector& stride, Padding padding, std::vector explicit_paddings, TensorFormat data_format, const TensorShape& tensor_in_shape); @@ -90,7 +90,7 @@ template class MaxPoolingOp : public OpKernel { public: explicit MaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; auto status = context->GetAttr("data_format", &data_format); if (status.ok()) { OP_REQUIRES(context, FormatFromString(data_format, &data_format_), @@ -297,8 +297,8 @@ class MaxPoolingOp : public OpKernel { } } - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; std::vector explicit_paddings_; TensorFormat data_format_; @@ -314,12 +314,12 @@ struct LaunchMaxPoolingNoMask_NCHW_VECT_C { const Tensor& input, Tensor* output) { #if GOOGLE_CUDA bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()( - reinterpret_cast(input.flat().data()), + reinterpret_cast(input.flat().data()), params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols, params.depth, params.out_height, params.out_width, params.window_rows, params.window_cols, params.row_stride, params.col_stride, params.pad_top, params.pad_left, - reinterpret_cast(output->flat().data()), + reinterpret_cast(output->flat().data()), context->eigen_gpu_device()); if (!status) { context->SetStatus(errors::Internal( @@ -338,7 +338,7 @@ template class MaxPoolingV2Op : public OpKernel { public: explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) { - string data_format; + std::string data_format; auto status = context->GetAttr("data_format", &data_format); if (status.ok()) { OP_REQUIRES(context, FormatFromString(data_format, &data_format_), @@ -375,17 +375,17 @@ class MaxPoolingV2Op : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& tensor_in = context->input(0); - std::vector ksize = ksize_; - std::vector stride = stride_; + std::vector ksize = ksize_; + std::vector stride = stride_; if (context->num_inputs() != 1) { const Tensor& tensor_ksize = context->input(1); - auto value_ksize = tensor_ksize.flat(); + auto value_ksize = tensor_ksize.flat(); ksize.resize(tensor_ksize.shape().num_elements()); std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); const Tensor& tensor_stride = context->input(2); - auto value_stride = tensor_stride.flat(); + auto value_stride = tensor_stride.flat(); stride.resize(tensor_stride.shape().num_elements()); std::copy_n(&value_stride(0), stride.size(), stride.begin()); } @@ -572,8 +572,8 @@ class MaxPoolingV2Op : public OpKernel { } } - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + 
std::vector stride_; Padding padding_; TensorFormat data_format_; }; diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h index c5d51e5935677a..7a891ddd63f2b3 100644 --- a/tensorflow/core/kernels/pooling_ops_common_gpu.h +++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h @@ -41,8 +41,8 @@ class DnnPoolingOp { typedef GPUDevice Device; static void Compute(OpKernelContext* context, se::dnn::PoolingMode pooling_mode, - const std::vector& size, - const std::vector& stride, Padding padding, + const std::vector& size, + const std::vector& stride, Padding padding, std::vector explicit_paddings, TensorFormat data_format, const Tensor& tensor_in, const TensorShape& tensor_out_shape, bool propagate_nans); @@ -57,8 +57,8 @@ class DnnPoolingGradOp { typedef GPUDevice Device; static void Compute(OpKernelContext* context, se::dnn::PoolingMode pooling_mode, - const std::vector& size, - const std::vector& stride, Padding padding, + const std::vector& size, + const std::vector& stride, Padding padding, std::vector explicit_paddings, TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out, const Tensor& out_backprop, diff --git a/tensorflow/core/kernels/population_count_op.cc b/tensorflow/core/kernels/population_count_op.cc index 9d0fc7530ae889..c43415982f257a 100644 --- a/tensorflow/core/kernels/population_count_op.cc +++ b/tensorflow/core/kernels/population_count_op.cc @@ -49,7 +49,7 @@ class PopulationCountOp : public OpKernel { OP_REQUIRES_OK(c, c->allocate_output(0, input_t.shape(), &output_t)); auto input = input_t.flat(); - auto output = output_t->flat(); + auto output = output_t->flat(); functor::PopulationCount popcnt; popcnt(c, input, output); @@ -77,7 +77,7 @@ namespace functor { namespace { template -inline uint8 PopCnt(const T v); +inline uint8_t PopCnt(const T v); #define POPCNT(T, N) \ template <> \ @@ -86,13 +86,13 @@ inline uint8 PopCnt(const T v); } POPCNT(int8_t, 8); -POPCNT(uint8, 8); +POPCNT(uint8_t, 8); POPCNT(int16_t, 16); -POPCNT(uint16, 16); +POPCNT(uint16_t, 16); POPCNT(int32_t, 32); -POPCNT(uint32, 32); +POPCNT(uint32_t, 32); POPCNT(int64_t, 64); -POPCNT(uint64, 64); +POPCNT(uint64_t, 64); #undef POPCNT @@ -101,9 +101,9 @@ POPCNT(uint64, 64); template struct PopulationCount { void operator()(OpKernelContext* c, typename TTypes::ConstFlat input, - TTypes::Flat output) { + TTypes::Flat output) { const T* input_ptr = input.data(); - uint8* output_ptr = output.data(); + uint8_t* output_ptr = output.data(); auto shard = [input_ptr, output_ptr](int64_t start, int64_t limit) { for (int64_t i = start; i < limit; ++i) { output_ptr[i] = PopCnt(input_ptr[i]); @@ -113,8 +113,9 @@ struct PopulationCount { // Approximating cost of popcnt: convert T to int64 // (std::bitset constructor) and convert int64 to uint8 // (bitset.count() -> output). The .count() itself is relatively cheap. - const double total_cost = (Eigen::TensorOpCost::CastCost() + - Eigen::TensorOpCost::CastCost()); + const double total_cost = + (Eigen::TensorOpCost::CastCost() + + Eigen::TensorOpCost::CastCost()); const int64_t shard_cost = (total_cost >= static_cast(std::numeric_limits::max())) ? 
std::numeric_limits::max() diff --git a/tensorflow/core/kernels/population_count_op.h b/tensorflow/core/kernels/population_count_op.h index 2c9812967366d8..b9811e59c3ea38 100644 --- a/tensorflow/core/kernels/population_count_op.h +++ b/tensorflow/core/kernels/population_count_op.h @@ -28,7 +28,7 @@ namespace functor { template struct PopulationCount { void operator()(OpKernelContext* c, typename TTypes::ConstFlat input, - TTypes::Flat output); + TTypes::Flat output); }; } // namespace functor diff --git a/tensorflow/core/kernels/population_count_op_gpu.cu.cc b/tensorflow/core/kernels/population_count_op_gpu.cu.cc index 5f2f14cfba0fb7..7df72b3a8f0b84 100644 --- a/tensorflow/core/kernels/population_count_op_gpu.cu.cc +++ b/tensorflow/core/kernels/population_count_op_gpu.cu.cc @@ -35,34 +35,34 @@ namespace functor { template __global__ void PopulationCountKernel(const int size, const T* __restrict__ input, - uint8* __restrict__ output) { + uint8_t* __restrict__ output) { GPU_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); } } template <> __global__ void PopulationCountKernel(const int size, - const int8* __restrict__ input, - uint8* __restrict__ output) { + const int8_t* __restrict__ input, + uint8_t* __restrict__ output) { // For some reason, __popc on a negative int8 gets confused. GPU_1D_KERNEL_LOOP(i, size) { - output[i] = __popc(ldg(reinterpret_cast(input + i))); + output[i] = __popc(ldg(reinterpret_cast(input + i))); } } template <> __global__ void PopulationCountKernel(const int size, - const int16* __restrict__ input, - uint8* __restrict__ output) { + const int16_t* __restrict__ input, + uint8_t* __restrict__ output) { // For some reason, __popc on a negative int16 gets confused. GPU_1D_KERNEL_LOOP(i, size) { - output[i] = __popc(ldg(reinterpret_cast(input + i))); + output[i] = __popc(ldg(reinterpret_cast(input + i))); } } template <> -__global__ void PopulationCountKernel(const int size, - const int64* __restrict__ input, - uint8* __restrict__ output) { +__global__ void PopulationCountKernel( + const int size, const int64_t* __restrict__ input, + uint8_t* __restrict__ output) { GPU_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); } } diff --git a/tensorflow/core/kernels/priority_queue.cc b/tensorflow/core/kernels/priority_queue.cc index 56ea77fdbcf2ca..490cc338ddb99c 100644 --- a/tensorflow/core/kernels/priority_queue.cc +++ b/tensorflow/core/kernels/priority_queue.cc @@ -37,7 +37,7 @@ namespace tensorflow { PriorityQueue::PriorityQueue(int32_t capacity, const DataTypeVector& component_dtypes, const std::vector& component_shapes, - const string& name) + const std::string& name) : TypedQueue(capacity, component_dtypes, component_shapes, name) {} absl::Status PriorityQueue::Initialize() { diff --git a/tensorflow/core/kernels/priority_queue.h b/tensorflow/core/kernels/priority_queue.h index f7ca800a66bf7a..46408300778673 100644 --- a/tensorflow/core/kernels/priority_queue.h +++ b/tensorflow/core/kernels/priority_queue.h @@ -50,7 +50,7 @@ class PriorityQueue public: PriorityQueue(int32_t capacity, const DataTypeVector& component_dtypes, const std::vector& component_shapes, - const string& name); + const std::string& name); absl::Status Initialize() override; // Must be called before any other method. 
@@ -69,7 +69,7 @@ class PriorityQueue absl::Status MatchesPriorityNodeDefTypes(const NodeDef& node_def) const; absl::Status MatchesPriorityNodeDefShapes(const NodeDef& node_def) const; - int32 size() const override { + int32_t size() const override { mutex_lock lock(mu_); return queues_[0].size(); } diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h index 88bee91121641a..f10b5e823d4143 100644 --- a/tensorflow/core/kernels/quantization_utils.h +++ b/tensorflow/core/kernels/quantization_utils.h @@ -84,7 +84,7 @@ T FloatToQuantized(float input, float range_min, float range_max) { static_cast(Eigen::NumTraits::highest()); quantized = std::max(quantized, lowest_quantized); quantized = std::min(quantized, highest_quantized); - return static_cast(static_cast(quantized)); + return static_cast(static_cast(quantized)); } template @@ -284,7 +284,7 @@ inline void RequantizeManyInNewRangeReference(const qint32* input, int64_t quantized_int64 = round_intermediate >> fp_shift; quantized_int64 = std::max(quantized_int64, int64_t{0}); quantized_int64 = std::min(quantized_int64, int64_t{255}); - output[index] = static_cast(static_cast(quantized_int64)); + output[index] = static_cast(static_cast(quantized_int64)); } } @@ -310,7 +310,7 @@ inline void RequantizeManyInNewRange8To32BitReference( int64_t output_value = code_0_int64 + (input_value * mult_int32); output_value = std::max(output_value, lowest_quantized); output_value = std::min(output_value, highest_quantized); - output[i] = static_cast(output_value); + output[i] = static_cast(output_value); } } @@ -725,7 +725,7 @@ inline void RequantizeManyInNewRangeUsingEigen( auto intermediate = fp_value.unaryExpr(int64_right_shift_op()); auto input_requantized = intermediate.cwiseMax(int64_t{0}) .cwiseMin(int64_t{255}) - .template cast() + .template cast() .template cast(); output->flat().device(device) = input_requantized; } diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc index 689e98cfebb2de..6c0251b7249484 100644 --- a/tensorflow/core/kernels/quantization_utils_test.cc +++ b/tensorflow/core/kernels/quantization_utils_test.cc @@ -60,7 +60,7 @@ void TestRequantizeMany(Eigen::ThreadPoolDevice* eigen_device, float input_min, &o_tensor); } - const string tolerance_str = absl::StrCat("+-", tolerance); + const std::string tolerance_str = absl::StrCat("+-", tolerance); for (size_t value_index = 0; value_index < values_count; ++value_index) { int e = expected_values[value_index]; int v = output_values(value_index); @@ -96,7 +96,7 @@ void TestRequantizeMany8To32Bit(float input_min, float input_max, input_max, output_min, output_max, output_values.data()); - const string tolerance_str = absl::StrCat("+-", tolerance); + const std::string tolerance_str = absl::StrCat("+-", tolerance); for (int value_index = 0; value_index < values_count; ++value_index) { const qint32 e = expected_values[value_index]; const qint32 v = output_values(value_index); @@ -143,7 +143,7 @@ void TestRequantizeManyInNewRange32To8Bit( qint32 high = Eigen::NumTraits::highest(); std::vector vals{low, high}; int num_steps = 14419; - qint32 step = static_cast((1LL << 32) / num_steps); + qint32 step = static_cast((1LL << 32) / num_steps); qint32 v = low + static_cast(1); for (int i = 0; i < num_steps; ++i) { vals.push_back(v); @@ -405,7 +405,7 @@ void TestQuantizedToFloatInPlaceUsingEigen( input_array(i) = Eigen::NumTraits::lowest() + i; } else { int64_t offset = static_cast(q_range / 
values_count * i); - input_array(i) = static_cast( + input_array(i) = static_cast( std::min(Eigen::NumTraits::lowest() + offset, Eigen::NumTraits::highest())); } @@ -662,8 +662,8 @@ void TestOverflowWithEigen() { // because the implementation does a bounds check using float, not int32. test::FillValues( &expected, - {static_cast(-2147483648), static_cast(-2147483648), - static_cast(2147483520), static_cast(2147483520)}); + {static_cast(-2147483648), static_cast(-2147483648), + static_cast(2147483520), static_cast(2147483520)}); FloatToQuantizedStruct f2q(input_min, input_max); Tensor output(DT_QINT32, shape); diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.cc b/tensorflow/core/kernels/quantize_and_dequantize_op.cc index e34601a86b6b77..64e7ec09c46eed 100644 --- a/tensorflow/core/kernels/quantize_and_dequantize_op.cc +++ b/tensorflow/core/kernels/quantize_and_dequantize_op.cc @@ -57,7 +57,7 @@ class QuantizeAndDequantizeV2Op : public OpKernel { " with signed_input_ ", signed_input_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_)); - string round_mode_string; + std::string round_mode_string; OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string)); OP_REQUIRES( ctx, @@ -284,7 +284,7 @@ class QuantizeAndDequantizeV3Op : public OpKernel { "be a scalar. Got dimensions: ", num_bits_tensor.dims())); - const int num_bits_val = num_bits_tensor.scalar()(); + const int num_bits_val = num_bits_tensor.scalar()(); OP_REQUIRES(ctx, num_bits_val > 0 && num_bits_val < (signed_input_ ? 62 : 63), InvalidArgument("num_bits is out of range: ", num_bits_val, diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc index 0d5b923ecbd0e7..b93292f83d677a 100644 --- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc +++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc @@ -86,7 +86,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_scalar_tensor_V3) { AddInputFromArray(TensorShape({1}), {-3.5}); AddInputFromArray(TensorShape({}), {0.0}); // Min AddInputFromArray(TensorShape({}), {0.0}); // Max - AddInputFromArray(TensorShape({}), {8}); // num_bits + AddInputFromArray(TensorShape({}), {8}); // num_bits TF_ASSERT_OK(RunOpKernel()); Tensor expected(allocator(), DT_FLOAT, TensorShape({1})); @@ -103,7 +103,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_scalar_tensor_V3) { template std::vector ScalePerSliceAlongAxis(std::vector dims, int axis, const std::vector& data) { - uint32 seed = 123; + uint32_t seed = 123; int64_t out_size = 1; for (int dim : dims) { out_size *= dim; @@ -292,7 +292,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8_V3) { AddInputFromArray(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555}); AddInputFromArray(TensorShape({}), {0.0}); // Min AddInputFromArray(TensorShape({}), {0.0}); // Max - AddInputFromArray(TensorShape({}), {8}); // num_bits + AddInputFromArray(TensorShape({}), {8}); // num_bits // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}. // Scale is: 1/128 @@ -337,7 +337,7 @@ TEST_P(ParameterizedQuantizeAndDequantizeTest, std::vector init_value(num_slices, 0.0f); AddInputFromArray(range_shape, init_value); // Min AddInputFromArray(range_shape, init_value); // Max - AddInputFromArray(TensorShape({}), {8}); // num_bits + AddInputFromArray(TensorShape({}), {8}); // num_bits // With int8, the values in the tensor are quantized to // {-127, -63, 0, 38, 102, 70, 64}. 
@@ -490,7 +490,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_V3) { AddInputFromArray(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555}); AddInputFromArray(TensorShape({}), {0.0}); // Min AddInputFromArray(TensorShape({}), {0.0}); // Max - AddInputFromArray(TensorShape({}), {4}); // num_bits + AddInputFromArray(TensorShape({}), {4}); // num_bits // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}. // Scale is: 1/8 @@ -583,7 +583,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given_V3) { {-0.8, -0.5, 0, 0.3, 0.8, 0.555, -2, 33}); AddInputFromArray(TensorShape({}), {-1.0}); // Min AddInputFromArray(TensorShape({}), {1.0}); // Max - AddInputFromArray(TensorShape({}), {8}); // num_bits + AddInputFromArray(TensorShape({}), {8}); // num_bits // Note that the range is given as [-1, 1]. // With int8, the tensor is quantized to {-102, -64, 0, 38, 102, 70, -128, @@ -664,7 +664,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_4D_tensor_with_uint8_range_given_V3) { AddInputFromArray(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8}); AddInputFromArray(TensorShape({}), {0.0}); // Min AddInputFromArray(TensorShape({}), {1.0}); // Max - AddInputFromArray(TensorShape({}), {8}); // num_bits + AddInputFromArray(TensorShape({}), {8}); // num_bits // Note that the range is given as [0, 1]. // With int8, the tensor is quantized to {0, 0, 76, 204} @@ -712,7 +712,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_tensor_with_all_0_V3) { AddInputFromArray(TensorShape({2, 2, 1, 1}), {0, 0, 0, 0}); AddInputFromArray(TensorShape({}), {0.0}); // Min AddInputFromArray(TensorShape({}), {0.0}); // Max - AddInputFromArray(TensorShape({}), {8}); // num_bits + AddInputFromArray(TensorShape({}), {8}); // num_bits TF_ASSERT_OK(RunOpKernel()); Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 1, 1})); @@ -755,7 +755,7 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given_V3) { AddInputFromArray(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8}); AddInputFromArray(TensorShape({}), {1.0}); // Min AddInputFromArray(TensorShape({}), {0.0}); // Max - AddInputFromArray(TensorShape({}), {8}); // num_bits + AddInputFromArray(TensorShape({}), {8}); // num_bits absl::Status s = RunOpKernel(); EXPECT_TRUE(absl::StrContains(s.ToString(), @@ -778,7 +778,7 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_axis_given_V3) { AddInputFromArray(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8}); AddInputFromArray(TensorShape({}), {1.0}); // Min AddInputFromArray(TensorShape({}), {0.0}); // Max - AddInputFromArray(TensorShape({}), {8}); // num_bits + AddInputFromArray(TensorShape({}), {8}); // num_bits EXPECT_THAT( RunOpKernel(), diff --git a/tensorflow/core/kernels/quantize_down_and_shrink_range.cc b/tensorflow/core/kernels/quantize_down_and_shrink_range.cc index 02ca323b991f68..9a49f96d4c6024 100644 --- a/tensorflow/core/kernels/quantize_down_and_shrink_range.cc +++ b/tensorflow/core/kernels/quantize_down_and_shrink_range.cc @@ -64,9 +64,9 @@ class QuantizeDownAndShrinkRangeOp : public OpKernel { // See QuantizationRangeOp as well, which has a copy of this logic. 
auto input_array = input.flat(); const int32_t input_lowest_quantized = - static_cast(Eigen::NumTraits::lowest()); + static_cast(Eigen::NumTraits::lowest()); const int32_t input_highest_quantized = - static_cast(Eigen::NumTraits::highest()); + static_cast(Eigen::NumTraits::highest()); T1 actual_min_quantized = input_highest_quantized; T1 actual_max_quantized = input_lowest_quantized; for (int i = 0; i < input_array.size(); ++i) { diff --git a/tensorflow/core/kernels/quantize_op.cc b/tensorflow/core/kernels/quantize_op.cc index c6cdbed7c0d5f6..c63c07a394b6c6 100644 --- a/tensorflow/core/kernels/quantize_op.cc +++ b/tensorflow/core/kernels/quantize_op.cc @@ -67,7 +67,7 @@ class QuantizeV2Op : public OpKernel { : (static_cast(std::numeric_limits::max()) - static_cast(std::numeric_limits::min()) + 1) / 2.0f; - string mode_string; + std::string mode_string; OP_REQUIRES_OK(ctx, ctx->GetAttr("mode", &mode_string)); OP_REQUIRES(ctx, (mode_string == "MIN_COMBINED" || mode_string == "MIN_FIRST" || @@ -83,7 +83,7 @@ class QuantizeV2Op : public OpKernel { mode_ = QUANTIZE_MODE_SCALED; } - string round_mode_string; + std::string round_mode_string; OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string)); OP_REQUIRES(ctx, (round_mode_string == "HALF_AWAY_FROM_ZERO" || diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc index 76fe2e9f963bef..ec486ba87dc990 100644 --- a/tensorflow/core/kernels/quantize_op_test.cc +++ b/tensorflow/core/kernels/quantize_op_test.cc @@ -62,7 +62,7 @@ TEST_F(QuantizedOpTest, QuantizeV2) { template std::vector ScalePerSliceAlongAxis(std::vector dims, int axis, const std::vector& data) { - uint32 seed = 123; + uint32_t seed = 123; std::minstd_rand rng(seed); int64_t out_size = 1; for (int dim : dims) { @@ -373,14 +373,14 @@ TEST_F(QuantizedOpTest, QuantizeV2_32Bit) { Tensor expected(allocator(), DT_QINT32, TensorShape({element_count})); test::FillValues(&expected, { - std::numeric_limits::min(), + std::numeric_limits::min(), 0, - static_cast(1.0f * (1 << 23)), - static_cast(1.25f * (1 << 23)), - static_cast(1.75f * (1 << 23)), - static_cast(127.0f * (1 << 23)), - static_cast(255.0f * (1 << 23)), - std::numeric_limits::max(), + static_cast(1.0f * (1 << 23)), + static_cast(1.25f * (1 << 23)), + static_cast(1.75f * (1 << 23)), + static_cast(127.0f * (1 << 23)), + static_cast(255.0f * (1 << 23)), + std::numeric_limits::max(), }); // We expect there will be some fuzziness in the lower bits, since this is // converting from float. 
diff --git a/tensorflow/core/kernels/quantized_add_op.cc b/tensorflow/core/kernels/quantized_add_op.cc index 5cf7ed1456034e..e8904e8a088395 100644 --- a/tensorflow/core/kernels/quantized_add_op.cc +++ b/tensorflow/core/kernels/quantized_add_op.cc @@ -149,7 +149,7 @@ void ScalarAddition(OpKernelContext* context, const quint8* full_input, full_input_in_output_range_64 = std::min(full_input_in_output_range_64, highest_quantized); const int32_t full_input_in_output_range = - static_cast(full_input_in_output_range_64); + static_cast(full_input_in_output_range_64); output[i] = full_input_in_output_range + scalar_in_output_range; } } @@ -272,13 +272,15 @@ void VectorAddition(OpKernelContext* context, const quint8* x_data, float min_x, int64_t x_in_output_range_64 = x_0_int64 + (x_value * x_mult_int32); x_in_output_range_64 = std::max(x_in_output_range_64, lowest_quantized); x_in_output_range_64 = std::min(x_in_output_range_64, highest_quantized); - const int32_t x_in_output_range = static_cast(x_in_output_range_64); + const int32_t x_in_output_range = + static_cast(x_in_output_range_64); const int64_t y_value = static_cast(y_data[i]); int64_t y_in_output_range_64 = y_0_int64 + (y_value * y_mult_int32); y_in_output_range_64 = std::max(y_in_output_range_64, lowest_quantized); y_in_output_range_64 = std::min(y_in_output_range_64, highest_quantized); - const int32_t y_in_output_range = static_cast(y_in_output_range_64); + const int32_t y_in_output_range = + static_cast(y_in_output_range_64); output[i] = x_in_output_range + y_in_output_range; } @@ -430,7 +432,7 @@ void VectorTensorAddition(const quint8* vector_data, float min_vector, vector_in_output_range_64 = std::min(vector_in_output_range_64, highest_quantized); const int32_t vector_in_output_range = - static_cast(vector_in_output_range_64); + static_cast(vector_in_output_range_64); const int64_t tensor_value = static_cast(tensor_data[i]); int64_t tensor_in_output_range_64 = @@ -440,7 +442,7 @@ void VectorTensorAddition(const quint8* vector_data, float min_vector, tensor_in_output_range_64 = std::min(tensor_in_output_range_64, highest_quantized); const int32_t tensor_in_output_range = - static_cast(tensor_in_output_range_64); + static_cast(tensor_in_output_range_64); output[i] = vector_in_output_range + tensor_in_output_range; } diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc index 7f7c59e2f40fc5..613fef99ea67c9 100644 --- a/tensorflow/core/kernels/quantized_concat_op.cc +++ b/tensorflow/core/kernels/quantized_concat_op.cc @@ -183,7 +183,7 @@ class QuantizedConcatOp : public OpKernel { errors::InvalidArgument( "Concat dim tensor should be a scalar integer, but got shape ", concat_dim_tensor->shape().DebugString())); - const int32_t concat_dim = concat_dim_tensor->scalar()(); + const int32_t concat_dim = concat_dim_tensor->scalar()(); OpInputList values; OP_REQUIRES_OK(context, context->input_list("values", &values)); const size_t N = values.size(); diff --git a/tensorflow/core/kernels/quantized_concat_op_test.cc b/tensorflow/core/kernels/quantized_concat_op_test.cc index 81f8b718d2b41e..cebe247f77f460 100644 --- a/tensorflow/core/kernels/quantized_concat_op_test.cc +++ b/tensorflow/core/kernels/quantized_concat_op_test.cc @@ -88,7 +88,7 @@ void QuantizedConcatTest::TestInvalidMinMax(const Tensor& first_min, Tensor second_quantized(DT_QUINT8, {1}); test::FillValues(&second_quantized, {1}); - AddInputFromArray(TensorShape({}), {0}); + AddInputFromArray(TensorShape({}), {0}); 
AddInputFromArray(first_quantized.shape(), first_quantized.flat()); AddInputFromArray(second_quantized.shape(), @@ -144,7 +144,7 @@ void QuantizedConcatTest::TestSmall8Bit(float first_min, float first_max, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}); - AddInputFromArray(TensorShape({}), {0}); + AddInputFromArray(TensorShape({}), {0}); AddInputFromArray(first_quantized.shape(), first_quantized.flat()); AddInputFromArray(second_quantized.shape(), @@ -210,7 +210,7 @@ void QuantizedConcatTest::TestSmall32Bit(float first_min, float first_max, {100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400}); - AddInputFromArray(TensorShape({}), {0}); + AddInputFromArray(TensorShape({}), {0}); AddInputFromArray(first_quantized.shape(), first_quantized.flat()); AddInputFromArray(second_quantized.shape(), @@ -272,7 +272,7 @@ void QuantizedConcatTest::TestSecondDim8Bit(float first_min, float first_max, {1, 2, 3, 4, 5, 6, 13, 14, 15, 16, 17, 18, 7, 8, 9, 10, 11, 12, 19, 20, 21, 22, 23, 24}); - AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {1}); AddInputFromArray(first_quantized.shape(), first_quantized.flat()); AddInputFromArray(second_quantized.shape(), @@ -303,7 +303,7 @@ static void ConcatHelper(::testing::benchmark::State& state, const int kDim1 = 100; TensorShape shape({kDim1, dim2}); - Tensor concat_dim = test::AsScalar(concat_dimension); + Tensor concat_dim = test::AsScalar(concat_dimension); Tensor in0(dt, shape); in0.flat().setRandom(); Tensor in1(dt, shape); diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc index 3f3e2743d674f4..14072547b310e7 100644 --- a/tensorflow/core/kernels/quantized_conv_ops.cc +++ b/tensorflow/core/kernels/quantized_conv_ops.cc @@ -62,8 +62,9 @@ class ReferenceConvFunctor { int output_shift, int output_offset, int output_mult) { // Set up some constants we need for the output down-shifting and // saturation. - const int32_t highest = static_cast(Eigen::NumTraits::highest()); - const int32_t lowest = static_cast(Eigen::NumTraits::lowest()); + const int32_t highest = + static_cast(Eigen::NumTraits::highest()); + const int32_t lowest = static_cast(Eigen::NumTraits::lowest()); // When we're converting the 32 bit accumulator to a lower bit depth, we // need to add on 0.5 in fixed-point terms to make the operation round half @@ -150,7 +151,7 @@ class ReferenceConvFunctor { // We're promoting the T1 type to a higher bit depth here as // we do the subtraction. input_value = - static_cast(input_source_value) - input_offset; + static_cast(input_source_value) - input_offset; } else { input_value = 0; } @@ -161,7 +162,7 @@ class ReferenceConvFunctor { (in_channel * filter_count) + out_channel]; // Another promotion to 32 bit, as above. const int32_t filter_value = - static_cast(filter_source_value) - filter_offset; + static_cast(filter_source_value) - filter_offset; total += (input_value * filter_value); } } @@ -406,9 +407,9 @@ class Im2ColConvFunctor { // The gemmlowp optimized library only works for a particular set of // data types, so check if we meet those requirements and fall back to a // slower reference implementation if not. 
- const uint8* im2col_data_as_uint8 = &(im2col_buffer->value); - const uint8* filter_data_as_uint8 = &(filter_data->value); - int32* output_data_as_int32 = &(chunk_output_data->value); + const uint8_t* im2col_data_as_uint8 = &(im2col_buffer->value); + const uint8_t* filter_data_as_uint8 = &(filter_data->value); + int32_t* output_data_as_int32 = &(chunk_output_data->value); // All of the transpose_* variables are currently compile-time consts, // so we could just hard-code these values too, but that would break if // anybody changed those values in the future (e.g. to match the ability @@ -472,7 +473,7 @@ class QuantizedConv2DOp : public OpKernel { context, (strides_[0] == 1 && strides_[3] == 1), errors::InvalidArgument("Current implementation does not yet support " "strides in the batch and depth dimensions.")); - std::vector dilations; + std::vector dilations; OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations)); OP_REQUIRES(context, dilations.size() == 4, errors::InvalidArgument("Dilations field must " @@ -612,7 +613,7 @@ class QuantizedConv2DOp : public OpKernel { } private: - std::vector strides_; + std::vector strides_; Padding padding_; }; diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc index ae65dc3b5e38ce..5f7143e183991a 100644 --- a/tensorflow/core/kernels/quantized_matmul_op.cc +++ b/tensorflow/core/kernels/quantized_matmul_op.cc @@ -38,9 +38,9 @@ template void GemmlowpMultiply(OpKernelContext* op_context, const quint8* a_data, const quint8* b_data, qint32* c_data, int m, int n, int k, int offset_a, int offset_b, int lda, int ldb, int ldc) { - const uint8* a_data_as_uint8 = &(a_data->value); - const uint8* b_data_as_uint8 = &(b_data->value); - int32* c_data_as_int32 = &(c_data->value); + const uint8_t* a_data_as_uint8 = &(a_data->value); + const uint8_t* b_data_as_uint8 = &(b_data->value); + int32_t* c_data_as_int32 = &(c_data->value); static const gemmlowp::MapOrder ResultOrder = !TransposeC ? 
gemmlowp::MapOrder::RowMajor : gemmlowp::MapOrder::ColMajor; static const gemmlowp::MapOrder LhsOrder = diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc index fed18e3a6f917d..9028137e49949d 100644 --- a/tensorflow/core/kernels/quantized_mul_op.cc +++ b/tensorflow/core/kernels/quantized_mul_op.cc @@ -38,9 +38,9 @@ void ScalarMultiply(OpKernelContext* context, const T* full_input, T scalar_input, int32_t scalar_input_offset, Toutput* output) { const int32_t scalar_minus_offset = - static_cast(scalar_input) - scalar_input_offset; + static_cast(scalar_input) - scalar_input_offset; for (int i = 0; i < num_elements; ++i) { - output[i] = (static_cast(full_input[i]) - full_input_offset) * + output[i] = (static_cast(full_input[i]) - full_input_offset) * scalar_minus_offset; } } @@ -115,8 +115,8 @@ void VectorMultiply(OpKernelContext* context, const T* x_data, int32_t offset_x, const T* y_data, int32_t offset_y, int64_t num_elements, Toutput* output) { for (int i = 0; i < num_elements; ++i) { - output[i] = (static_cast(x_data[i]) - offset_x) * - (static_cast(y_data[i]) - offset_y); + output[i] = (static_cast(x_data[i]) - offset_x) * + (static_cast(y_data[i]) - offset_y); } } @@ -193,8 +193,8 @@ void VectorTensorMultiply(const T* vector_data, int32_t vector_offset, Toutput* output) { for (int i = 0; i < tensor_num_elements; ++i) { const int64_t vector_i = i % vector_num_elements; - output[i] = (static_cast(vector_data[vector_i]) - vector_offset) * - (static_cast(tensor_data[i]) - tensor_offset); + output[i] = (static_cast(vector_data[vector_i]) - vector_offset) * + (static_cast(tensor_data[i]) - tensor_offset); } } diff --git a/tensorflow/core/kernels/quantized_pooling_ops.cc b/tensorflow/core/kernels/quantized_pooling_ops.cc index 5efedd082c4aea..5a05d1635c1d6b 100644 --- a/tensorflow/core/kernels/quantized_pooling_ops.cc +++ b/tensorflow/core/kernels/quantized_pooling_ops.cc @@ -95,8 +95,9 @@ class QuantizedAvgPoolingOp : public OpKernel { params.forward_output_shape(¶ms_forward_output_shape)); OP_REQUIRES_OK(context, context->allocate_output( 0, params_forward_output_shape, &output)); - const int32_t highest = static_cast(Eigen::NumTraits::highest()); - const int32_t lowest = static_cast(Eigen::NumTraits::lowest()); + const int32_t highest = + static_cast(Eigen::NumTraits::highest()); + const int32_t lowest = static_cast(Eigen::NumTraits::lowest()); // TODO(vrv): Switch this to the Eigen::Tensor version of // SpatialAvgPooling once that version is running quickly. @@ -105,12 +106,12 @@ class QuantizedAvgPoolingOp : public OpKernel { Tensor int32_output(DT_INT32, params_forward_output_shape); // Cast input to int32 tensor and call SpatialAvgPool. Tensor int32_input(DT_INT32, tensor_in.shape()); - int32_input.flat() = tensor_in.flat().template cast(); - SpatialAvgPool(context, &int32_output, int32_input, params, - padding_); + int32_input.flat() = tensor_in.flat().template cast(); + SpatialAvgPool(context, &int32_output, int32_input, params, + padding_); // Clamp the int32 output back into quantized space. 
- output->flat() = int32_output.flat() + output->flat() = int32_output.flat() .cwiseMax(lowest) .cwiseMin(highest) .template cast(); @@ -124,8 +125,8 @@ class QuantizedAvgPoolingOp : public OpKernel { } private: - std::vector ksize_; - std::vector stride_; + std::vector ksize_; + std::vector stride_; Padding padding_; }; diff --git a/tensorflow/core/kernels/quantized_reshape_op_test.cc b/tensorflow/core/kernels/quantized_reshape_op_test.cc index a7066f98f39e99..a2c7b60bbc71db 100644 --- a/tensorflow/core/kernels/quantized_reshape_op_test.cc +++ b/tensorflow/core/kernels/quantized_reshape_op_test.cc @@ -56,7 +56,7 @@ TEST_F(QuantizedReshapeTest, Reshape) { expected.flat()(i) = quint8(i); } AddInputFromArray(input.shape(), input.flat()); - AddInputFromList({3}, {5, 10, 4}); // shape + AddInputFromList({3}, {5, 10, 4}); // shape AddInputFromArray(TensorShape({1}), {-10}); AddInputFromArray(TensorShape({1}), {20}); TF_ASSERT_OK(RunOpKernel()); diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op.cc index 2efdd38dc6ef45..4e6f072973b3e1 100644 --- a/tensorflow/core/kernels/quantized_resize_bilinear_op.cc +++ b/tensorflow/core/kernels/quantized_resize_bilinear_op.cc @@ -132,7 +132,7 @@ inline T ComputeLerp(const T top_left, const T top_right, const T bottom_left, MulOffset(bottom_right, bottom_left, x_lerp); const T_CALC out = top + (bottom - top) / RESOLUTION_MULT * y_lerp; return static_cast( - static_cast((out + RESOLUTION_MULT / 2) / RESOLUTION_MULT)); + static_cast((out + RESOLUTION_MULT / 2) / RESOLUTION_MULT)); } #ifdef QUANTIZED_RESIZE_BILINEAR_USE_NEON @@ -266,7 +266,7 @@ inline void OutputLerpForChannels(const InterpolationCache& xs, } template -inline void OutputLerp8x8x1(const InterpolationCache& xs, +inline void OutputLerp8x8x1(const InterpolationCache& xs, const int64_t x_start, const int16_t ys_ilerp, const float min, const float max, const quint8* const ys_input_lower_ptr, @@ -284,7 +284,7 @@ inline void OutputLerp8x8x1(const InterpolationCache& xs, #else for (int x = x_start; x < x_start + 8; ++x) { - OutputLerpForChannels( + OutputLerpForChannels( xs, x, ys_ilerp, 1, min, max, ys_input_lower_ptr, ys_input_upper_ptr, output_y_ptr); } @@ -292,7 +292,7 @@ inline void OutputLerp8x8x1(const InterpolationCache& xs, } template -inline void OutputLerp8x8x3(const InterpolationCache& xs, +inline void OutputLerp8x8x3(const InterpolationCache& xs, const int64_t x_start, const int16_t ys_ilerp, const float min, const float max, const quint8* const ys_input_lower_ptr, @@ -325,7 +325,7 @@ inline void OutputLerp8x8x3(const InterpolationCache& xs, #else for (int x = x_start; x < x_start + 8; ++x) { - OutputLerpForChannels( + OutputLerpForChannels( xs, x, ys_ilerp, 3, min, max, ys_input_lower_ptr, ys_input_upper_ptr, output_y_ptr); } @@ -333,7 +333,7 @@ inline void OutputLerp8x8x3(const InterpolationCache& xs, } template -inline void OutputLerp32x4x1(const InterpolationCache& xs, +inline void OutputLerp32x4x1(const InterpolationCache& xs, const int64_t x_start, const int32_t ys_ilerp, const float min, const float max, const qint32* const ys_input_lower_ptr, @@ -373,7 +373,7 @@ inline void OutputLerp32x4x1(const InterpolationCache& xs, #else for (int x = x_start; x < x_start + 4; ++x) { - OutputLerpForChannels( + OutputLerpForChannels( xs, x, ys_ilerp, 1, min, max, ys_input_lower_ptr, ys_input_upper_ptr, output_y_ptr); } @@ -381,7 +381,7 @@ inline void OutputLerp32x4x1(const InterpolationCache& xs, } template -inline void 
OutputLerp32x4x3(const InterpolationCache& xs, +inline void OutputLerp32x4x3(const InterpolationCache& xs, const int64_t x_start, const int32_t ys_ilerp, const float min, const float max, const qint32* const ys_input_lower_ptr, @@ -458,7 +458,7 @@ inline void OutputLerp32x4x3(const InterpolationCache& xs, #else for (int x = x_start; x < x_start + 4; ++x) { - OutputLerpForChannels( + OutputLerpForChannels( xs, x, ys_ilerp, 3, min, max, ys_input_lower_ptr, ys_input_upper_ptr, output_y_ptr); } @@ -543,10 +543,10 @@ void ResizeImage(typename TTypes::ConstTensor images, CHECK_NOTNULL(output); - const InterpolationCache xs = - BuildLerpCache(out_width, in_width, width_scale, channels, - RESOLUTION, half_pixel_centers); - const InterpolationCache ys = BuildLerpCache( + const InterpolationCache xs = + BuildLerpCache(out_width, in_width, width_scale, channels, + RESOLUTION, half_pixel_centers); + const InterpolationCache ys = BuildLerpCache( out_height, in_height, height_scale, 1, RESOLUTION, half_pixel_centers); const int64_t in_row_size = in_width * channels; @@ -581,7 +581,7 @@ void ResizeImage(typename TTypes::ConstTensor images, } } for (; x < out_width; ++x) { - OutputLerpForChannels( + OutputLerpForChannels( xs, x, ys_ilerp, channels, in_min, in_max, ys_input_lower_ptr, ys_input_upper_ptr, output_y_ptr); } @@ -606,10 +606,10 @@ void ResizeImage(typename TTypes::ConstTensor images, CHECK_NOTNULL(output); - const InterpolationCache xs = - BuildLerpCache(out_width, in_width, width_scale, channels, - RESOLUTION, half_pixel_centers); - const InterpolationCache ys = BuildLerpCache( + const InterpolationCache xs = + BuildLerpCache(out_width, in_width, width_scale, channels, + RESOLUTION, half_pixel_centers); + const InterpolationCache ys = BuildLerpCache( out_height, in_height, height_scale, 1, RESOLUTION, half_pixel_centers); const int64_t in_row_size = in_width * channels; @@ -646,7 +646,7 @@ void ResizeImage(typename TTypes::ConstTensor images, } } for (; x < out_width; ++x) { - OutputLerpForChannels( + OutputLerpForChannels( xs, x, ys_ilerp, channels, in_min, in_max, ys_input_lower_ptr, ys_input_upper_ptr, output_y_ptr); } diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc index 52c66efd890ea6..8c2426ee6621b7 100644 --- a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc +++ b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc @@ -171,8 +171,8 @@ void CheckTensorValue(const T* in_data, const T* out_data, const int batch_size, const float val = QuantizedToFloat(qval, min, max); if (!relative) { const int q_tolerance = std::round(tolerance); - EXPECT_TRUE(std::abs(static_cast(ref_qval) - - static_cast(qval)) <= q_tolerance) + EXPECT_TRUE(std::abs(static_cast(ref_qval) - + static_cast(qval)) <= q_tolerance) << "ref = " << ref_val << ", val = " << val << ", " << b << ", " << y << ", " << x << ", " << c << ", qval = " << qval << ", ref qval = " << ref_qval << ", " << q_tolerance; @@ -197,7 +197,7 @@ void TestResizeBilinear(const Tensor& image_tensor, const DataType dt, Scope root = Scope::NewRootScope(); Output placeholder = ops::Placeholder(root.WithOpName("placeholder"), dt); - Output size = ops::Const(root.WithOpName("size"), new_size); + Output size = ops::Const(root.WithOpName("size"), new_size); Output in_min = ops::Const(root.WithOpName("min"), min); Output in_max = ops::Const(root.WithOpName("max"), max); diff --git a/tensorflow/core/kernels/queue_base.cc 
b/tensorflow/core/kernels/queue_base.cc index 4274c775bd1557..e62b4cdf2db9d6 100644 --- a/tensorflow/core/kernels/queue_base.cc +++ b/tensorflow/core/kernels/queue_base.cc @@ -51,7 +51,7 @@ absl::Status HandleSliceToElement(const Tensor& parent, Tensor* element, QueueBase::QueueBase(int32_t capacity, const DataTypeVector& component_dtypes, const std::vector& component_shapes, - const string& name) + const std::string& name) : capacity_(capacity), component_dtypes_(component_dtypes), component_shapes_(component_shapes), @@ -78,8 +78,9 @@ absl::Status QueueBase::ValidateTupleCommon(const Tuple& tuple) const { } // static -string QueueBase::ShapeListString(const absl::Span& shapes) { - string result = "["; +std::string QueueBase::ShapeListString( + const absl::Span& shapes) { + std::string result = "["; bool first = true; for (const TensorShape& shape : shapes) { absl::StrAppend(&result, first ? "" : ", ", shape.DebugString()); @@ -90,7 +91,7 @@ string QueueBase::ShapeListString(const absl::Span& shapes) { } absl::Status QueueBase::MatchesNodeDefOp(const NodeDef& node_def, - const string& op) const { + const std::string& op) const { if (node_def.op() != op) { return errors::InvalidArgument("Shared queue '", name_, "' has type '", op, "' that does not match type of Node '", diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h index d39ab45498b843..e55693b4d540d4 100644 --- a/tensorflow/core/kernels/queue_base.h +++ b/tensorflow/core/kernels/queue_base.h @@ -46,7 +46,7 @@ class QueueBase : public QueueInterface { // name: A name to use for the queue. QueueBase(int32_t capacity, const DataTypeVector& component_dtypes, const std::vector& component_shapes, - const string& name); + const std::string& name); // Implementations of QueueInterface methods -------------------------------- const DataTypeVector& component_dtypes() const override { @@ -64,7 +64,7 @@ class QueueBase : public QueueInterface { return component_shapes_; } - int32 capacity() const { return capacity_; } + int32_t capacity() const { return capacity_; } bool is_closed() const override { mutex_lock lock(mu_); @@ -103,7 +103,7 @@ class QueueBase : public QueueInterface { }; // Returns the number of components in a queue-element tuple. - int32 num_components() const { return component_dtypes_.size(); } + int32_t num_components() const { return component_dtypes_.size(); } // True if shapes were specified. If so, inputs will be validated // against them, etc. @@ -135,26 +135,27 @@ class QueueBase : public QueueInterface { ~QueueBase() override; // Helpers for implementing MatchesNodeDef(). 
- static string ShapeListString(const absl::Span& shapes); + static std::string ShapeListString( + const absl::Span& shapes); absl::Status MatchesNodeDefOp(const NodeDef& node_def, - const string& op) const; + const std::string& op) const; absl::Status MatchesNodeDefCapacity(const NodeDef& node_def, int32_t capacity) const; absl::Status MatchesNodeDefTypes(const NodeDef& node_def) const; absl::Status MatchesNodeDefShapes(const NodeDef& node_def) const; protected: - const int32 capacity_; + const int32_t capacity_; const DataTypeVector component_dtypes_; const std::vector component_shapes_; - const string name_; + const std::string name_; mutable mutex mu_; bool closed_ TF_GUARDED_BY(mu_); struct Attempt; typedef std::function RunCallback; struct Attempt { - int32 elements_requested; + int32_t elements_requested; DoneCallback done_callback; // must be run outside mu_ OpKernelContext* context; CancellationManager* cancellation_manager; // not owned diff --git a/tensorflow/core/kernels/queue_op.cc b/tensorflow/core/kernels/queue_op.cc index e16c6034de4596..2f77020256080a 100644 --- a/tensorflow/core/kernels/queue_op.cc +++ b/tensorflow/core/kernels/queue_op.cc @@ -210,7 +210,7 @@ DequeueManyOp::DequeueManyOp(OpKernelConstruction* context) void DequeueManyOp::ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, DoneCallback callback) { const Tensor& Tnum_elements = ctx->input(1); - int32_t num_elements = Tnum_elements.flat()(0); + int32_t num_elements = Tnum_elements.flat()(0); OP_REQUIRES_ASYNC(ctx, num_elements >= 0, errors::InvalidArgument("DequeueManyOp requested ", @@ -283,7 +283,7 @@ DequeueUpToOp::DequeueUpToOp(OpKernelConstruction* context) void DequeueUpToOp::ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, DoneCallback callback) { const Tensor& Tnum_elements = ctx->input(1); - int32_t num_elements = Tnum_elements.flat()(0); + int32_t num_elements = Tnum_elements.flat()(0); OP_REQUIRES_ASYNC(ctx, num_elements >= 0, errors::InvalidArgument("DequeueUpToOp requested ", @@ -349,7 +349,7 @@ void QueueSizeOp::ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, DoneCallback callback) { Tensor* Tqueue_size = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_size)); - Tqueue_size->flat().setConstant(queue->size()); + Tqueue_size->flat().setConstant(queue->size()); callback(); } diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h index 57a771d91fcb50..4c5c1ee10b0433 100644 --- a/tensorflow/core/kernels/queue_op.h +++ b/tensorflow/core/kernels/queue_op.h @@ -39,7 +39,7 @@ class QueueOp : public ResourceOpKernel { protected: // Variables accessible by subclasses - int32 capacity_; + int32_t capacity_; DataTypeVector component_types_; private: diff --git a/tensorflow/core/kernels/ragged_cross_op.cc b/tensorflow/core/kernels/ragged_cross_op.cc index 9e7d0d52ac2ae7..9612e6bcdbabfb 100644 --- a/tensorflow/core/kernels/ragged_cross_op.cc +++ b/tensorflow/core/kernels/ragged_cross_op.cc @@ -51,7 +51,7 @@ class FeatureReader { virtual int64_t FeatureCount(int64_t batch) const = 0; // Copies the value for the specified feature to `out`. 
- virtual void ReadValue(int64_t batch, int64_t n, uint64* out) const = 0; + virtual void ReadValue(int64_t batch, int64_t n, uint64_t* out) const = 0; virtual void ReadValue(int64_t batch, int64_t n, tstring* out) const = 0; virtual ~FeatureReader() {} @@ -70,10 +70,10 @@ void CopyToString(const tstring& src, tstring* dst) { void CopyToString(int64_t src, tstring* dst) { *dst = std::to_string(src); } // Copies a feature value `src` to an int64 fingerprint `dst`. -void CopyToFingerprint(const tstring& feature, uint64* dst) { +void CopyToFingerprint(const tstring& feature, uint64_t* dst) { *dst = Fingerprint64(feature); } -void CopyToFingerprint(int64_t feature, uint64* dst) { *dst = feature; } +void CopyToFingerprint(int64_t feature, uint64_t* dst) { *dst = feature; } // A FeatureReader that is backed by a ragged tensor. template @@ -87,7 +87,7 @@ class RaggedFeatureReader : public FeatureReader { return row_splits_(batch + 1) - row_splits_(batch); } - void ReadValue(int64_t batch, int64_t n, uint64* out) const override { + void ReadValue(int64_t batch, int64_t n, uint64_t* out) const override { CopyToFingerprint(values_(row_splits_(batch) + n), out); } @@ -110,7 +110,7 @@ class DenseFeatureReader : public FeatureReader { int64_t FeatureCount(int64_t batch) const override { return feature_count_; } - void ReadValue(int64_t batch, int64_t n, uint64* out) const override { + void ReadValue(int64_t batch, int64_t n, uint64_t* out) const override { CopyToFingerprint(values_(batch, n), out); } @@ -145,7 +145,7 @@ class SparseFeatureReader : public FeatureReader { return row_splits_[batch + 1] - row_splits_[batch]; } - void ReadValue(int64_t batch, int64_t n, uint64* out) const override { + void ReadValue(int64_t batch, int64_t n, uint64_t* out) const override { CopyToFingerprint(values_(row_splits_[batch] + n), out); } @@ -179,7 +179,7 @@ class OutputWriterImpl : public OutputWriter { using FlatSplits = typename TTypes::ConstFlat; OutputWriterImpl(const FeatureReaders& features, int64_t num_buckets, - uint64 hash_key, const Tensor* splits_out, + uint64_t hash_key, const Tensor* splits_out, Tensor* values_out) : features_(features), num_buckets_(num_buckets), @@ -220,9 +220,9 @@ class OutputWriterImpl : public OutputWriter { void WriteCombination(int64_t batch_index, const std::vector& combination, int64_t* out) { // Do the fingerprint concatenation on uint64. - uint64 hashed_output = hash_key_; + uint64_t hashed_output = hash_key_; for (size_t i = 0; i < combination.size(); ++i) { - uint64 hash_i; + uint64_t hash_i; features_[i]->ReadValue(batch_index, combination[i], &hash_i); hashed_output = FingerprintCat64(hashed_output, hash_i); } @@ -254,7 +254,7 @@ class OutputWriterImpl : public OutputWriter { const FeatureReaders& features_; const int64_t num_buckets_; - const uint64 hash_key_; + const uint64_t hash_key_; FlatSplits splits_out_; FlatValues values_out_; }; @@ -263,7 +263,7 @@ class OutputWriterImpl : public OutputWriter { // given tensors. 
std::unique_ptr MakeOutputWriter(const FeatureReaders& features, int64_t num_buckets, - uint64 hash_key, + uint64_t hash_key, const Tensor* splits_out, Tensor* values_out) { if (values_out->dtype() == DT_INT64) { @@ -271,7 +271,7 @@ std::unique_ptr MakeOutputWriter(const FeatureReaders& features, return std::make_unique>( features, num_buckets, hash_key, splits_out, values_out); } else { - return std::make_unique>( + return std::make_unique>( features, num_buckets, hash_key, splits_out, values_out); } } else { @@ -279,7 +279,7 @@ std::unique_ptr MakeOutputWriter(const FeatureReaders& features, return std::make_unique>( features, num_buckets, hash_key, splits_out, values_out); } else { - return std::make_unique>( + return std::make_unique>( features, num_buckets, hash_key, splits_out, values_out); } } @@ -298,7 +298,7 @@ class RaggedCrossOp : public OpKernel { // supported by REGISTER_OP. int64_t signed_hash_key_; OP_REQUIRES_OK(context, context->GetAttr("hash_key", &signed_hash_key_)); - hash_key_ = static_cast(signed_hash_key_); + hash_key_ = static_cast(signed_hash_key_); int num_sparse; OP_REQUIRES_OK(context, context->GetAttr("Nsparse", &num_sparse)); @@ -542,7 +542,7 @@ class RaggedCrossOp : public OpKernel { new RaggedFeatureReader(values, splits)); } else { features->emplace_back( - new RaggedFeatureReader(values, splits)); + new RaggedFeatureReader(values, splits)); } } else { if (splits.dtype() == DT_INT64) { @@ -550,7 +550,7 @@ class RaggedCrossOp : public OpKernel { new RaggedFeatureReader(values, splits)); } else { features->emplace_back( - new RaggedFeatureReader(values, splits)); + new RaggedFeatureReader(values, splits)); } } return absl::OkStatus(); @@ -632,7 +632,7 @@ class RaggedCrossOp : public OpKernel { } int64_t num_buckets_; - uint64 hash_key_; + uint64_t hash_key_; std::vector ragged_values_types_; std::vector ragged_splits_types_; std::vector sparse_values_types_; @@ -642,8 +642,8 @@ class RaggedCrossOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("RaggedCross") .Device(DEVICE_CPU) - .TypeConstraint("out_row_splits_type"), - RaggedCrossOp); + .TypeConstraint("out_row_splits_type"), + RaggedCrossOp); REGISTER_KERNEL_BUILDER(Name("RaggedCross") .Device(DEVICE_CPU) .TypeConstraint("out_row_splits_type"), diff --git a/tensorflow/core/kernels/ragged_gather_op_test.cc b/tensorflow/core/kernels/ragged_gather_op_test.cc index ca070524a62acc..cebccdd360f2d4 100644 --- a/tensorflow/core/kernels/ragged_gather_op_test.cc +++ b/tensorflow/core/kernels/ragged_gather_op_test.cc @@ -65,7 +65,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather) { // indices = [2, 1, 0, 3] // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] // params.shape = [4, None] - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({4}), // indices.shape {2, 1, 0, 3}, // indices {{0, 3, 3, 7, 9}}, // params_nested_splits @@ -87,7 +87,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_3DParams) { // indices = [2, 1, 0, 2, 3] // params = [[[]], [[.1, 2], [.3]], [], [[.4, .5], [.6, .7, .8]], [[.9]]] // params.shape = [5, None, None] - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({5}), // indices.shape {2, 1, 0, 2, 3}, // indices {{0, 1, 3, 3, 5, 6}, {0, 0, 2, 3, 5, 8, 9}}, // params_nested_splits @@ -111,7 +111,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_4DParams) { // indices = [2, 1, 0, 2] // params = [[[]], [[[1, 2], [3, 4], [5, 6]], [[7, 8]]], []] // params.shape = [4, None, None, 2] - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({4}), // indices.shape {2, 1, 0, 2}, 
// indices {{0, 1, 3, 3}, {0, 0, 3, 4}}, // params_nested_splits @@ -129,15 +129,15 @@ TEST_F(RaggedGatherOpTest, RaggedGather_4DParams) { test::AsTensor({0, 0, 2, 3, 3})); test::ExpectTensorEqual(*GetOutput(1), test::AsTensor({0, 3, 4, 4})); - test::ExpectTensorEqual( + test::ExpectTensorEqual( *GetOutput(2), - test::AsTensor({1, 2, 3, 4, 5, 6, 7, 8}, TensorShape({4, 2}))); + test::AsTensor({1, 2, 3, 4, 5, 6, 7, 8}, TensorShape({4, 2}))); } TEST_F(RaggedGatherOpTest, RaggedGather_2DIndices) { // indices = [[2, 1], [0, 3]] // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({2, 2}), // indices.shape {2, 1, 0, 3}, // indices {{0, 3, 3, 7, 9}}, // params_nested_splits @@ -161,7 +161,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_2DIndices) { TEST_F(RaggedGatherOpTest, RaggedGather_ScalarIndices) { // indices = 2 // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({}), // indices.shape {2}, // indices {{0, 3, 3, 7, 9}}, // params_nested_splits @@ -178,7 +178,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_ScalarIndices) { TEST_F(RaggedGatherOpTest, RaggedGather_OutOfBounds) { // indices = [2, 10] // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({2}), // indices.shape {2, 10}, // indices {{0, 3, 3, 7, 9}}, // params_nested_splits @@ -189,7 +189,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_OutOfBounds) { } TEST_F(RaggedGatherOpTest, InvalidSplitsNotSorted) { - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({2}), // indices.shape {0, 2}, // indices {{0, 3, 5, 2, 9}}, // params_nested_splits @@ -200,7 +200,7 @@ TEST_F(RaggedGatherOpTest, InvalidSplitsNotSorted) { } TEST_F(RaggedGatherOpTest, InvalidSplitsNegative) { - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({2}), // indices.shape {0, 2}, // indices {{-1, 3, 2, 7, 9}}, // params_nested_splits @@ -211,7 +211,7 @@ TEST_F(RaggedGatherOpTest, InvalidSplitsNegative) { } TEST_F(RaggedGatherOpTest, InvalidSplitsEmpty) { - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({0}), // indices.shape {}, // indices {{}}, // params_nested_splits @@ -222,7 +222,7 @@ TEST_F(RaggedGatherOpTest, InvalidSplitsEmpty) { } TEST_F(RaggedGatherOpTest, InvalidSplitsTooBig) { - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({2}), // indices.shape {0, 2}, // indices {{0, 20, 40, 80, 100}}, // params_nested_splits @@ -234,7 +234,7 @@ TEST_F(RaggedGatherOpTest, InvalidSplitsTooBig) { } TEST_F(RaggedGatherOpTest, BadValuesShape) { - BuildRaggedGatherGraph( + BuildRaggedGatherGraph( TensorShape({0}), // indices.shape {}, // indices {{0}}, // params_nested_splits diff --git a/tensorflow/core/kernels/ragged_range_op_test.cc b/tensorflow/core/kernels/ragged_range_op_test.cc index 699531a8d3647c..9a951af9017a36 100644 --- a/tensorflow/core/kernels/ragged_range_op_test.cc +++ b/tensorflow/core/kernels/ragged_range_op_test.cc @@ -90,10 +90,10 @@ TEST_F(RaggedRangeOpTest, RangeSizeOverflow) { } TEST_F(RaggedRangeOpTest, RangeSizeOverflow2) { - BuildRaggedRangeGraph(); - AddInputFromArray(TensorShape({}), {static_cast(5e18)}); - AddInputFromArray(TensorShape({}), {static_cast(-5e18)}); - AddInputFromArray(TensorShape({}), {-1}); + BuildRaggedRangeGraph(); + AddInputFromArray(TensorShape({}), {static_cast(5e18)}); + AddInputFromArray(TensorShape({}), {static_cast(-5e18)}); + AddInputFromArray(TensorShape({}), {-1}); 
EXPECT_EQ(absl::StrCat("Requires ((limit - start) / delta) <= ", std::numeric_limits::max()), diff --git a/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc index 7f92a50133ce99..ffb186af87ece4 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc +++ b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc @@ -228,8 +228,8 @@ class RaggedTensorToSparseOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("RaggedTensorToSparse") .Device(DEVICE_CPU) - .TypeConstraint("Tsplits"), - RaggedTensorToSparseOp); + .TypeConstraint("Tsplits"), + RaggedTensorToSparseOp); REGISTER_KERNEL_BUILDER(Name("RaggedTensorToSparse") .Device(DEVICE_CPU) diff --git a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc index 516a0cddcb6acc..28820593a4b5c5 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc +++ b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc @@ -445,8 +445,8 @@ void copy_array(tstring* dst, const tstring* src, } template <> -void copy_array(tstring* dst, const tstring* src, - int32_t size) { +void copy_array(tstring* dst, const tstring* src, + int32_t size) { slow_copy_array(dst, src, size); } @@ -460,8 +460,8 @@ void copy_array(Eigen::half* dst, const Eigen::half* src, } template <> -void copy_array(Eigen::half* dst, const Eigen::half* src, - int32_t size) { +void copy_array(Eigen::half* dst, const Eigen::half* src, + int32_t size) { slow_copy_array(dst, src, size); } diff --git a/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc b/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc index b0f53598d32de9..e23a2c07ed861b 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc +++ b/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc @@ -51,7 +51,8 @@ class RaggedTensorToTensorOpTest : public ::tensorflow::OpsTestBase { // Builds the tensorflow test graph for RaggedTensorToTensor. 
template void BuildRaggedTensorToTensorGraph( - const TensorShape& shape, const std::vector& row_partition_types, + const TensorShape& shape, + const std::vector& row_partition_types, const ShapeAndValues& values, const ShapeAndValues& default_value, const std::vector>& row_partition_tensors) { @@ -95,12 +96,13 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor) { // indices = [2, 1, 0, 3] // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] // params.shape = [4, None] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({4, 4}), // shape {"FIRST_DIM_SIZE", "VALUE_ROWIDS"}, // row_partition_types createVector({.1, .2, .3, .4, .5, .6, .7, .8, .9}), // values createScalar(1.5), // default_value - {createScalar(4), createVector({0, 0, 0, 2, 2, 2, 2, 3, 3})} + {createScalar(4), + createVector({0, 0, 0, 2, 2, 2, 2, 3, 3})} // row_partition_tensors ); @@ -117,12 +119,12 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor) { TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorRowSplits) { // indices = [2, 1, 0, 3] // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({4, 4}), // shape {"ROW_SPLITS"}, // row_partition_types createVector({.1, .2, .3, .4, .5, .6, .7, .8, .9}), // values - createScalar(1.5), // default_value - {createVector({0, 3, 3, 7, 9})} // row_partition_tensors + createScalar(1.5), // default_value + {createVector({0, 3, 3, 7, 9})} // row_partition_tensors ); TF_ASSERT_OK(RunOpKernel()); @@ -143,16 +145,16 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_3DParams) { // [[.4, .5], [.6, .7, .8]], // [[.9]] // ] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({5, 2, 3}), // shape {"FIRST_DIM_SIZE", "VALUE_ROWIDS", "VALUE_ROWIDS"}, // row_partition_types createVector({.1, .2, .3, .4, .5, .6, .7, .8, .9}), // values createScalar(1.5), // default_value { - createScalar(5), - createVector({0, 1, 1, 3, 3, 4}), - createVector({1, 1, 2, 3, 3, 4, 4, 4, 5}), + createScalar(5), + createVector({0, 1, 1, 3, 3, 4}), + createVector({1, 1, 2, 3, 3, 4, 4, 4, 5}), } // row_partition_tensors ); TF_ASSERT_OK(RunOpKernel()); @@ -181,14 +183,14 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_3DParamsRowSplits) { // [[.4, .5], [.6, .7, .8]], // [[.9]] // ] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({5, 2, 3}), // shape {"ROW_SPLITS", "ROW_SPLITS"}, // row_partition_types createVector({.1, .2, .3, .4, .5, .6, .7, .8, .9}), // values createScalar(1.5), // default_value { - createVector({0, 1, 3, 3, 5, 6}), - createVector({0, 0, 2, 3, 5, 8, 9}), + createVector({0, 1, 3, 3, 5, 6}), + createVector({0, 0, 2, 3, 5, 8, 9}), } // row_partition_tensors ); TF_ASSERT_OK(RunOpKernel()); @@ -249,15 +251,16 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParams) { // [] // ] // params.shape = [3, 2, 3, 2] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({4, 2, 3, 2}), // shape {"FIRST_DIM_SIZE", "VALUE_ROWIDS", "VALUE_ROWIDS", - "VALUE_ROWIDS"}, // row_partition_types - createVector({1, 2, 3, 4, 5, 6, 7, 8}), // values - createScalar(15), // default_value - {createScalar(5), createVector({0, 1, 1}), - createVector({1, 1, 1, 2}), - createVector({0, 0, 1, 1, 2, 2, 3, 3})} // row_partition_tensors + "VALUE_ROWIDS"}, // row_partition_types + createVector({1, 2, 3, 4, 5, 6, 7, 8}), // values + createScalar(15), // default_value + {createScalar(5), 
createVector({0, 1, 1}), + createVector({1, 1, 1, 2}), + createVector({0, 0, 1, 1, 2, 2, 3, 3})} + // row_partition_tensors ); TF_ASSERT_OK(RunOpKernel()); @@ -277,9 +280,9 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParams) { // [[15,15],[15,15],[15,15]], // ] // params.shape = [3, 2, 3, 2] - test::ExpectTensorEqual( + test::ExpectTensorEqual( *GetOutput(0), - test::AsTensor( + test::AsTensor( {15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1, 2, 3, 4, 5, 6, 7, 8, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}, @@ -296,14 +299,14 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsRowSplit) { // [] // ] // params.shape = [3, 2, 3, 2] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({4, 2, 3, 2}), // shape {"ROW_SPLITS", "ROW_SPLITS", "ROW_SPLITS"}, // row_partition_types - createVector({1, 2, 3, 4, 5, 6, 7, 8}), // values - createScalar(15), // default_value - {createVector({0, 1, 3}), createVector({0, 0, 3, 4}), - createVector({0, 2, 4, 6, 8})} // row_partition_tensors + createVector({1, 2, 3, 4, 5, 6, 7, 8}), // values + createScalar(15), // default_value + {createVector({0, 1, 3}), createVector({0, 0, 3, 4}), + createVector({0, 2, 4, 6, 8})} // row_partition_tensors ); TF_ASSERT_OK(RunOpKernel()); @@ -323,9 +326,9 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsRowSplit) { // [[15,15],[15,15],[15,15]], // ] // params.shape = [3, 2, 3, 2] - test::ExpectTensorEqual( + test::ExpectTensorEqual( *GetOutput(0), - test::AsTensor( + test::AsTensor( {15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1, 2, 3, 4, 5, 6, 7, 8, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}, @@ -334,12 +337,13 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsRowSplit) { TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorContractExpanded) { // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({3, 5}), // shape {"FIRST_DIM_SIZE", "VALUE_ROWIDS"}, // row_partition_types createVector({.1, .2, .3, .4, .5, .6, .7, .8, .9}), // values createScalar(1.5), // default_value - {createScalar(4), createVector({0, 0, 0, 2, 2, 2, 2, 3, 3})} + {createScalar(4), + createVector({0, 0, 0, 2, 2, 2, 2, 3, 3})} // row_partition_tensors ); @@ -357,14 +361,15 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorContractExpanded) { // Adds a dense dimension. 
TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorContractExpandedDense) { // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({3, 5, 2}), // shape {"FIRST_DIM_SIZE", "VALUE_ROWIDS"}, // row_partition_types ShapeAndValues{TensorShape({9, 2}), {.1, 1.1, .2, 1.2, .3, 1.3, .4, 1.4, .5, 1.5, .6, 1.6, .7, 1.7, .8, 1.8, .9, 1.9}}, // values createScalar(1.5), // default_value - {createScalar(4), createVector({0, 0, 0, 2, 2, 2, 2, 3, 3})} + {createScalar(4), + createVector({0, 0, 0, 2, 2, 2, 2, 3, 3})} // row_partition_tensors ); @@ -386,12 +391,13 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorConstrained) { // [.4, .5, .6, .7], // [.8, .9]] // constrained to (3, 3) - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({3, 3}), // shape {"FIRST_DIM_SIZE", "VALUE_ROWIDS"}, // row_partition_types createVector({.1, .2, .3, .4, .5, .6, .7, .8, .9}), // values createScalar(1.5), // default_value - {createScalar(4), createVector({0, 0, 0, 2, 2, 2, 2, 3, 3})} + {createScalar(4), + createVector({0, 0, 0, 2, 2, 2, 2, 3, 3})} // row_partition_tensors ); @@ -418,16 +424,16 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_3DParamsConstrained) { // [[.9]] // ] // params.shape = [5, None, None] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({4, 1, 2}), // shape {"FIRST_DIM_SIZE", "VALUE_ROWIDS", "VALUE_ROWIDS"}, // row_partition_types createVector({.1, .2, .3, .4, .5, .6, .7, .8, .9}), // values createScalar(1.5), // default_value { - createScalar(5), - createVector({0, 1, 1, 3, 3, 4}), - createVector({1, 1, 2, 3, 3, 4, 4, 4, 5}), + createScalar(5), + createVector({0, 1, 1, 3, 3, 4}), + createVector({1, 1, 2, 3, 3, 4, 4, 4, 5}), } // row_partition_tensors ); TF_ASSERT_OK(RunOpKernel()); @@ -457,15 +463,16 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsConstrained) { // [] // ] // params.shape = [3, 2, 3, 2] - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({2, 2, 2, 2}), // shape {"FIRST_DIM_SIZE", "VALUE_ROWIDS", "VALUE_ROWIDS", - "VALUE_ROWIDS"}, // row_partition_types - createVector({1, 2, 3, 4, 5, 6, 7, 8}), // values - createScalar(15), // default_value - {createScalar(5), createVector({0, 1, 1}), - createVector({1, 1, 1, 2}), - createVector({0, 0, 1, 1, 2, 2, 3, 3})} // row_partition_tensors + "VALUE_ROWIDS"}, // row_partition_types + createVector({1, 2, 3, 4, 5, 6, 7, 8}), // values + createScalar(15), // default_value + {createScalar(5), createVector({0, 1, 1}), + createVector({1, 1, 1, 2}), + createVector({0, 0, 1, 1, 2, 2, 3, 3})} + // row_partition_tensors ); TF_ASSERT_OK(RunOpKernel()); @@ -480,25 +487,38 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsConstrained) { // ], // ] // params.shape = [3, 2, 3, 2] - test::ExpectTensorEqual(*GetOutput(0), test::AsTensor( - { - 15, 15, 15, 15, // - 15, 15, 15, 15, // - 1, 2, 3, 4, // - 7, 8, 15, 15, // - }, - TensorShape({2, 2, 2, 2}))); + test::ExpectTensorEqual(*GetOutput(0), + test::AsTensor( + { + 15, + 15, + 15, + 15, // + 15, + 15, + 15, + 15, // + 1, + 2, + 3, + 4, // + 7, + 8, + 15, + 15, // + }, + TensorShape({2, 2, 2, 2}))); } TEST_F(RaggedTensorToTensorOpTest, ShapeWrongDimensions) { - BuildRaggedTensorToTensorGraph( + BuildRaggedTensorToTensorGraph( TensorShape({10, 7, 10, 20}), // shape {"FIRST_DIM_SIZE", "VALUE_ROWIDS", - "VALUE_ROWIDS"}, // row_partition_types - createVector({1, 2, 3, 4}), // values - 
createScalar(15), // default_value - {createScalar(5), createVector({0, 1, 1}), - createVector({1, 1, 1, 2})} // row_partition_tensors + "VALUE_ROWIDS"}, // row_partition_types + createVector({1, 2, 3, 4}), // values + createScalar(15), // default_value + {createScalar(5), createVector({0, 1, 1}), + createVector({1, 1, 1, 2})} // row_partition_tensors ); // Fails with an invalid argument. EXPECT_EQ(absl::IsInvalidArgument(RunOpKernel()), true); @@ -508,7 +528,7 @@ class RaggedTensorToTensorOpUnknownShapeTest : public ::tensorflow::OpsTestBase { protected: std::unique_ptr op_; - void SetAttributes(const absl::Span row_partition_types, + void SetAttributes(const absl::Span row_partition_types, int num_row_partition_tensors) { op_ = std::make_unique("RaggedTensorToTensor"); SetAttrValue(row_partition_types, @@ -519,7 +539,8 @@ class RaggedTensorToTensorOpUnknownShapeTest }; TEST_F(RaggedTensorToTensorOpUnknownShapeTest, ValueRowIDs) { - SetAttributes(absl::Span{"FIRST_DIM_SIZE", "VALUE_ROWIDS"}, 2); + SetAttributes(absl::Span{"FIRST_DIM_SIZE", "VALUE_ROWIDS"}, + 2); INFER_OK(*op_, "?;?;?;?;?", "?"); INFER_OK(*op_, "?;[6];[];[];[6]", "[?,?]"); @@ -544,7 +565,7 @@ TEST_F(RaggedTensorToTensorOpUnknownShapeTest, ValueRowIDs) { TEST_F(RaggedTensorToTensorOpUnknownShapeTest, RowSplits) { // RaggedTensorToTensor(param_splits+, param_values, indices) -> [splits+, // values] - SetAttributes(absl::Span{"ROW_SPLITS"}, 1); + SetAttributes(absl::Span{"ROW_SPLITS"}, 1); // value, default_value, ROW_SPLITS INFER_OK(*op_, "?;?;?;?", "?"); diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc index b4d7fc8395b614..a46f40d177778c 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc +++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc @@ -256,7 +256,7 @@ class RaggedTensorToVariantGradientOp : public OpKernel { auto flat_row_splits = row_splits.flat(); TensorShape dense_values_shape; OP_REQUIRES_OK(context, - TensorShapeUtils::MakeShape(context->input(2).vec(), + TensorShapeUtils::MakeShape(context->input(2).vec(), &dense_values_shape)); // Validate row_splits. 
diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc index 95bd16bbbcbafe..f25f8b34198702 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc +++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc @@ -232,7 +232,7 @@ TEST_F(RaggedTensorToVariantKernelTest, NonEmptyBatchedInputInt32Splits) { const std::vector batched_values = {0, 1, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 8, 9}; - BuildEncodeRaggedTensorGraph( + BuildEncodeRaggedTensorGraph( {batched_splits_1, batched_splits_2, batched_splits_3}, TensorShape({14}), batched_values, true); TF_ASSERT_OK(RunOpKernel()); @@ -240,12 +240,12 @@ TEST_F(RaggedTensorToVariantKernelTest, NonEmptyBatchedInputInt32Splits) { const auto& encoded_list = GetOutput(0)->vec(); EXPECT_EQ(encoded_list.size(), 2); - ExpectRaggedTensorVariantEqual( - CreateVariantFromRagged( + ExpectRaggedTensorVariantEqual( + CreateVariantFromRagged( {{0, 1, 3, 4, 5, 6}, {0, 2, 3, 4, 5, 6, 7}}, {0, 1, 1, 2, 2, 3, 4}), *encoded_list(0).get()); - ExpectRaggedTensorVariantEqual( - CreateVariantFromRagged( + ExpectRaggedTensorVariantEqual( + CreateVariantFromRagged( {{0, 1, 2, 3, 4, 5}, {0, 1, 2, 5, 6, 7}}, {5, 6, 7, 8, 9, 8, 9}), *encoded_list(1).get()); } diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h index 7dc63ac8fbf7f8..87cfc50f8a268a 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h +++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h @@ -140,7 +140,7 @@ class RaggedTensorToVariantGradientKernelTest void BuildEncodeRaggedTensorGradientGraph( const std::vector& encoded_ragged_grad, const std::vector& row_splits, - const std::vector& dense_values_shape) { + const std::vector& dense_values_shape) { const auto values_dtype = DataTypeToEnum::v(); const auto splits_dtype = DataTypeToEnum::v(); @@ -161,8 +161,8 @@ class RaggedTensorToVariantGradientKernelTest AddInputFromArray(TensorShape({splits_size}), row_splits); int64_t dense_values_shape_size = dense_values_shape.size(); - AddInputFromArray(TensorShape({dense_values_shape_size}), - dense_values_shape); + AddInputFromArray(TensorShape({dense_values_shape_size}), + dense_values_shape); } template diff --git a/tensorflow/core/kernels/ragged_tensor_variant.cc b/tensorflow/core/kernels/ragged_tensor_variant.cc index b6b70a283c7c48..5608888b5500d1 100644 --- a/tensorflow/core/kernels/ragged_tensor_variant.cc +++ b/tensorflow/core/kernels/ragged_tensor_variant.cc @@ -22,9 +22,11 @@ limitations under the License. namespace tensorflow { -string RaggedTensorVariant::TypeName() const { return "RaggedTensorVariant"; } +std::string RaggedTensorVariant::TypeName() const { + return "RaggedTensorVariant"; +} -string RaggedTensorVariant::DebugString() const { +std::string RaggedTensorVariant::DebugString() const { return absl::StrCat( "RaggedTensorVariant(dtype=", DataTypeString(values_.dtype()), ", ragged_rank=", nested_splits_.size(), ", splits_dtype=", diff --git a/tensorflow/core/kernels/ragged_tensor_variant.h b/tensorflow/core/kernels/ragged_tensor_variant.h index 1d2066b0dcf457..c75505a603c531 100644 --- a/tensorflow/core/kernels/ragged_tensor_variant.h +++ b/tensorflow/core/kernels/ragged_tensor_variant.h @@ -41,8 +41,8 @@ class RaggedTensorVariant { : values_(std::move(values)), nested_splits_(nested_splits) {} // Variant support methods. 
- string TypeName() const; - string DebugString() const; + std::string TypeName() const; + std::string DebugString() const; void Encode(VariantTensorData* data) const; bool Decode(const VariantTensorData& data); diff --git a/tensorflow/core/kernels/random_binomial_op.cc b/tensorflow/core/kernels/random_binomial_op.cc index 98118b78eb5b58..875744b86ecf47 100644 --- a/tensorflow/core/kernels/random_binomial_op.cc +++ b/tensorflow/core/kernels/random_binomial_op.cc @@ -360,8 +360,8 @@ class RandomBinomialOp : public OpKernel { TensorShape bcast_shape = BCast::ToShape(bcast.output_shape()); TensorShape output_shape; if (shape_tensor.dtype() == DataType::DT_INT32) { - OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(shape_tensor.vec(), - &output_shape)); + OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape( + shape_tensor.vec(), &output_shape)); } else { OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape( shape_tensor.vec(), &output_shape)); @@ -380,11 +380,11 @@ class RandomBinomialOp : public OpKernel { const int64_t num_sample_dims = (shape_tensor.dim_size(0) - bcast.output_shape().size()); for (int64_t i = 0; i < num_sample_dims; ++i) { - samples_per_batch *= shape_tensor.flat()(i); + samples_per_batch *= shape_tensor.flat()(i); } int64_t num_batches = 1; for (int64_t i = num_sample_dims; i < shape_tensor.dim_size(0); ++i) { - num_batches *= shape_tensor.flat()(i); + num_batches *= shape_tensor.flat()(i); } const int64_t num_elements = num_batches * samples_per_batch; @@ -409,8 +409,9 @@ class RandomBinomialOp : public OpKernel { errors::InvalidArgument("Unsupported algorithm id: ", alg)); static_assert(std::is_same::value, "StateElementType must be int64"); - static_assert(std::is_same::value, - "PhiloxRandom::ResultElementType must be uint32"); + static_assert( + std::is_same::value, + "PhiloxRandom::ResultElementType must be uint32"); OP_REQUIRES(ctx, var_tensor_flat.size() >= PHILOX_MIN_STATE_SIZE, errors::InvalidArgument( "For Philox algorithm, the size of state must be at least ", @@ -478,8 +479,8 @@ class StatelessRandomBinomialOp : public OpKernel { TensorShape bcast_shape = BCast::ToShape(bcast.output_shape()); TensorShape output_shape; if (shape_tensor.dtype() == DataType::DT_INT32) { - OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(shape_tensor.vec(), - &output_shape)); + OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape( + shape_tensor.vec(), &output_shape)); } else { OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape( shape_tensor.vec(), &output_shape)); @@ -494,14 +495,14 @@ class StatelessRandomBinomialOp : public OpKernel { (shape_tensor.dim_size(0) - bcast.output_shape().size()); for (int64_t i = 0; i < num_sample_dims; ++i) { samples_per_batch *= shape_tensor.dtype() == DataType::DT_INT32 - ? shape_tensor.flat()(i) - : shape_tensor.flat()(i); + ? shape_tensor.flat()(i) + : shape_tensor.flat()(i); } int64_t num_batches = 1; for (int64_t i = num_sample_dims; i < shape_tensor.dim_size(0); ++i) { num_batches *= shape_tensor.dtype() == DataType::DT_INT32 - ? shape_tensor.flat()(i) - : shape_tensor.flat()(i); + ? 
shape_tensor.flat()(i) + : shape_tensor.flat()(i); } const int64_t num_elements = num_batches * samples_per_batch; @@ -557,7 +558,7 @@ class StatelessRandomBinomialOp : public OpKernel { REGISTER_ALL(Eigen::half); REGISTER_ALL(float); REGISTER_ALL(double); -REGISTER_ALL(int32); +REGISTER_ALL(int32_t); REGISTER_ALL(int64_t); #undef REGISTER diff --git a/tensorflow/core/kernels/random_binomial_op_test.cc b/tensorflow/core/kernels/random_binomial_op_test.cc index 80af07f13a4083..9e715b5afccf92 100644 --- a/tensorflow/core/kernels/random_binomial_op_test.cc +++ b/tensorflow/core/kernels/random_binomial_op_test.cc @@ -28,7 +28,7 @@ static Graph* RandomBinomialGraph(double count, double prob, int num_batches, int samples_per_batch) { Graph* g = new Graph(OpRegistry::Global()); Tensor shape_t(DT_INT32, TensorShape({2})); - shape_t.flat().setValues({num_batches, samples_per_batch}); + shape_t.flat().setValues({num_batches, samples_per_batch}); Tensor counts_t(DT_FLOAT, TensorShape({num_batches})); counts_t.flat().setConstant(count); diff --git a/tensorflow/core/kernels/random_index_shuffle_test.cc b/tensorflow/core/kernels/random_index_shuffle_test.cc index 259f484cc344ca..02458f4aa99f49 100644 --- a/tensorflow/core/kernels/random_index_shuffle_test.cc +++ b/tensorflow/core/kernels/random_index_shuffle_test.cc @@ -32,11 +32,11 @@ class RandomIndexShuffleTest : public ::testing::TestWithParam { // Check that we do a correct bijection. TEST_P(RandomIndexShuffleTest, Bijection) { - const std::array& key = {42, 73, 1991}; + const std::array& key = {42, 73, 1991}; const uint64_t max_value = GetMaxValue(); std::vector seen(max_value + 1, false); for (uint64_t value = 0; value <= max_value; ++value) { - const uint64 output_value = + const uint64_t output_value = index_shuffle(value, key, max_value, /* rounds= */ 4); EXPECT_GE(output_value, 0); EXPECT_LE(output_value, max_value); diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc index 7624b56b50b587..87179f9fef5e8f 100644 --- a/tensorflow/core/kernels/random_op.cc +++ b/tensorflow/core/kernels/random_op.cc @@ -157,7 +157,7 @@ class RandomGammaOp : public OpKernel { shape_t.DebugString())); TensorShape samples_shape; if (shape_t.dtype() == DataType::DT_INT32) { - auto vec = shape_t.flat(); + auto vec = shape_t.flat(); OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(vec.data(), vec.size(), &samples_shape)); } else if (shape_t.dtype() == DataType::DT_INT64) { diff --git a/tensorflow/core/kernels/random_op.h b/tensorflow/core/kernels/random_op.h index ea16f54ec9acb4..cef648707d3422 100644 --- a/tensorflow/core/kernels/random_op.h +++ b/tensorflow/core/kernels/random_op.h @@ -40,8 +40,8 @@ typedef Eigen::ThreadPoolDevice CPUDevice; // nullptr, they provide the input; otherwise `gen` provides the input. template struct FillPhiloxRandom { - void operator()(OpKernelContext* ctx, const CPUDevice& d, const uint64* key, - const uint64* counter, random::PhiloxRandom gen, + void operator()(OpKernelContext* ctx, const CPUDevice& d, const uint64_t* key, + const uint64_t* counter, random::PhiloxRandom gen, typename Distribution::ResultElementType* data, int64_t size, Distribution dist); }; @@ -51,8 +51,8 @@ typedef Eigen::GpuDevice GPUDevice; // Declares the partially GPU-specialized functor struct. 
template struct FillPhiloxRandom { - void operator()(OpKernelContext* ctx, const GPUDevice& d, const uint64* key, - const uint64* counter, random::PhiloxRandom gen, + void operator()(OpKernelContext* ctx, const GPUDevice& d, const uint64_t* key, + const uint64_t* counter, random::PhiloxRandom gen, typename Distribution::ResultElementType* data, int64_t size, Distribution dist); }; diff --git a/tensorflow/core/kernels/random_op_cpu.h b/tensorflow/core/kernels/random_op_cpu.h index cfa927c1e539ea..7d7a16dcc6a3fc 100644 --- a/tensorflow/core/kernels/random_op_cpu.h +++ b/tensorflow/core/kernels/random_op_cpu.h @@ -60,8 +60,8 @@ using random::SingleSampleAdapter; template struct FillPhiloxRandom { typedef typename Distribution::ResultElementType T; - void operator()(OpKernelContext* ctx, const Device&, const uint64* key, - const uint64* counter, random::PhiloxRandom gen, T* data, + void operator()(OpKernelContext* ctx, const Device&, const uint64_t* key, + const uint64_t* counter, random::PhiloxRandom gen, T* data, int64_t size, Distribution dist) { OP_REQUIRES( ctx, false, @@ -156,8 +156,8 @@ struct FillPhiloxRandomTask { // It splits the work into several tasks and run them in parallel template void FillPhiloxRandom::operator()( - OpKernelContext* ctx, const CPUDevice&, const uint64* key, - const uint64* counter, random::PhiloxRandom gen, + OpKernelContext* ctx, const CPUDevice&, const uint64_t* key, + const uint64_t* counter, random::PhiloxRandom gen, typename Distribution::ResultElementType* data, int64_t size, Distribution dist) { if (key != nullptr && counter != nullptr) { diff --git a/tensorflow/core/kernels/random_op_gpu.h b/tensorflow/core/kernels/random_op_gpu.h index f8efa21daba8ff..dbb66c2148397d 100644 --- a/tensorflow/core/kernels/random_op_gpu.h +++ b/tensorflow/core/kernels/random_op_gpu.h @@ -34,17 +34,17 @@ struct FillPhiloxRandomKernel; template struct FillPhiloxRandomKernel { typedef typename Distribution::ResultElementType T; - PHILOX_DEVICE_INLINE void Run(const uint64* key, const uint64* counter, - random::PhiloxRandom gen, T* data, int64 size, + PHILOX_DEVICE_INLINE void Run(const uint64_t* key, const uint64_t* counter, + random::PhiloxRandom gen, T* data, int64_t size, Distribution dist); }; template struct FillPhiloxRandomKernel { typedef typename Distribution::ResultElementType T; - PHILOX_DEVICE_INLINE void Run(const uint64* key, const uint64* counter, + PHILOX_DEVICE_INLINE void Run(const uint64_t* key, const uint64_t* counter, random::PhiloxRandom base_gen, T* data, - int64 size, Distribution dist); + int64_t size, Distribution dist); }; template @@ -83,14 +83,14 @@ class SampleCopier { }; template <> -class SampleCopier { +class SampleCopier { public: // Copies the elements from the array to buf. buf must be 128-bit aligned, // which is true for tensor data, and all offsets that are a multiple of the // vector size (because the vectors are 128 bits long). inline __device__ void operator()( - int32* __restrict__ buf, - const tensorflow::random::Array& array) const { + int32_t* __restrict__ buf, + const tensorflow::random::Array& array) const { ::int4 vec; vec.x = array[0]; vec.y = array[1]; @@ -119,14 +119,14 @@ class SampleCopier { }; template <> -class SampleCopier { +class SampleCopier { public: // Copies the elements from the array to buf. buf must be 128-bit aligned, // which is true for tensor data, and all offsets that are a multiple of the // vector size (because the vectors are 128 bits long). 
inline __device__ void operator()( - int64* __restrict__ buf, - const tensorflow::random::Array& array) const { + int64_t* __restrict__ buf, + const tensorflow::random::Array& array) const { longlong2 vec; vec.x = array[0]; vec.y = array[1]; @@ -139,13 +139,13 @@ class SampleCopier { // distribution. Each output takes a fixed number of samples. template PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel::Run( - const uint64* key, const uint64* counter, random::PhiloxRandom gen, T* data, - int64 size, Distribution dist) { + const uint64_t* key, const uint64_t* counter, random::PhiloxRandom gen, + T* data, int64_t size, Distribution dist) { const int kGroupSize = Distribution::kResultElementCount; - const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const int32 total_thread_count = gridDim.x * blockDim.x; - int64 offset = thread_id * kGroupSize; + const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t total_thread_count = gridDim.x * blockDim.x; + int64_t offset = thread_id * kGroupSize; if (key != nullptr && counter != nullptr) { gen = GetPhiloxRandomFromCounterKeyMem(counter, key); } @@ -174,8 +174,8 @@ PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel::Run( // distribution. Each output takes a variable number of samples. template PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel::Run( - const uint64* key, const uint64* counter, random::PhiloxRandom base_gen, - T* data, int64 size, Distribution dist) { + const uint64_t* key, const uint64_t* counter, random::PhiloxRandom base_gen, + T* data, int64_t size, Distribution dist) { if (key != nullptr && counter != nullptr) { base_gen = GetPhiloxRandomFromCounterKeyMem(counter, key); } @@ -189,10 +189,10 @@ PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel::Run( kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; - const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const int32 total_thread_count = gridDim.x * blockDim.x; - int64 group_index = thread_id; - int64 offset = group_index * kGroupSize; + const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t total_thread_count = gridDim.x * blockDim.x; + int64_t group_index = thread_id; + int64_t offset = group_index * kGroupSize; while (offset < size) { // Since each output takes a variable number of samples, we need to @@ -219,10 +219,10 @@ PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel::Run( // A simple launch pad to call the correct function templates to fill the data template __global__ void __launch_bounds__(1024) - FillPhiloxRandomKernelLaunch(const uint64* key, const uint64* counter, + FillPhiloxRandomKernelLaunch(const uint64_t* key, const uint64_t* counter, random::PhiloxRandom base_gen, typename Distribution::ResultElementType* data, - int64 size, Distribution dist) { + int64_t size, Distribution dist) { FillPhiloxRandomKernel() .Run(key, counter, base_gen, data, size, dist); @@ -231,13 +231,13 @@ __global__ void __launch_bounds__(1024) // Partial specialization for GPU template void FillPhiloxRandom::operator()( - OpKernelContext*, const GPUDevice& d, const uint64* key, - const uint64* counter, random::PhiloxRandom gen, - typename Distribution::ResultElementType* data, int64 size, + OpKernelContext*, const GPUDevice& d, const uint64_t* key, + const uint64_t* counter, random::PhiloxRandom gen, + typename Distribution::ResultElementType* data, int64_t size, Distribution dist) { if (size == 0) return; - const int32 block_size = d.maxGpuThreadsPerBlock(); - const int32 num_blocks = + const int32_t block_size = 
d.maxGpuThreadsPerBlock(); + const int32_t num_blocks = std::min( d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(), size + block_size - 1) / diff --git a/tensorflow/core/kernels/random_op_test.cc b/tensorflow/core/kernels/random_op_test.cc index 7292ce4ec8a3f0..5abe81f27f31e2 100644 --- a/tensorflow/core/kernels/random_op_test.cc +++ b/tensorflow/core/kernels/random_op_test.cc @@ -26,13 +26,13 @@ namespace tensorflow { namespace { Tensor VecShape(int64_t v) { - if (v >= std::numeric_limits::max()) { + if (v >= std::numeric_limits::max()) { Tensor shape(DT_INT64, TensorShape({1})); shape.vec()(0) = v; return shape; } else { Tensor shape(DT_INT32, TensorShape({1})); - shape.vec()(0) = v; + shape.vec()(0) = v; return shape; } } diff --git a/tensorflow/core/kernels/random_ops_util.h b/tensorflow/core/kernels/random_ops_util.h index b990456965ff59..c203181d575818 100644 --- a/tensorflow/core/kernels/random_ops_util.h +++ b/tensorflow/core/kernels/random_ops_util.h @@ -26,20 +26,21 @@ using random::PhiloxRandom; // The following 2 functions use the contract "lower 32 bits for the first // uint32, higher 32 bits for the second". Note that this is endian-neutral, // unlike a direct memory copy `memcpy(output, &input, 8)`. -PHILOX_DEVICE_INLINE void Uint64ToUint32s(uint64 input, uint32* output1, - uint32* output2) { - *output1 = static_cast(input); - *output2 = static_cast(input >> 32); +PHILOX_DEVICE_INLINE void Uint64ToUint32s(uint64_t input, uint32_t* output1, + uint32_t* output2) { + *output1 = static_cast(input); + *output2 = static_cast(input >> 32); } -PHILOX_DEVICE_INLINE uint64 Uint32sToUint64(uint32 input1, uint32 input2) { - auto u64_1 = static_cast(input1); - auto u64_2 = static_cast(input2); +PHILOX_DEVICE_INLINE uint64_t Uint32sToUint64(uint32_t input1, + uint32_t input2) { + auto u64_1 = static_cast(input1); + auto u64_2 = static_cast(input2); return u64_1 | (u64_2 << 32); } PHILOX_DEVICE_INLINE PhiloxRandom::ResultType GetCounterFromMem( - uint64 const* ptr) { + const uint64_t* ptr) { PhiloxRandom::ResultType counter; Uint64ToUint32s(ptr[0], &counter[0], &counter[1]); Uint64ToUint32s(ptr[1], &counter[2], &counter[3]); @@ -47,24 +48,24 @@ PHILOX_DEVICE_INLINE PhiloxRandom::ResultType GetCounterFromMem( } PHILOX_DEVICE_INLINE void WriteCounterToMem( - PhiloxRandom::ResultType const& counter, uint64* ptr) { + PhiloxRandom::ResultType const& counter, uint64_t* ptr) { ptr[0] = Uint32sToUint64(counter[0], counter[1]); ptr[1] = Uint32sToUint64(counter[2], counter[3]); } -PHILOX_DEVICE_INLINE PhiloxRandom::Key GetKeyFromMem(uint64 const* ptr) { +PHILOX_DEVICE_INLINE PhiloxRandom::Key GetKeyFromMem(const uint64_t* ptr) { PhiloxRandom::Key key; Uint64ToUint32s(ptr[0], &key[0], &key[1]); return key; } PHILOX_DEVICE_INLINE void WriteKeyToMem(PhiloxRandom::Key const& key, - uint64* ptr) { + uint64_t* ptr) { *ptr = Uint32sToUint64(key[0], key[1]); } PHILOX_DEVICE_INLINE PhiloxRandom GetPhiloxRandomFromCounterKeyMem( - uint64 const* counter_ptr, uint64 const* key_ptr) { + const uint64_t* counter_ptr, const uint64_t* key_ptr) { return PhiloxRandom(GetCounterFromMem(counter_ptr), GetKeyFromMem(key_ptr)); } diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc index 3c703f5d0ca0d7..9b1f93584ad86b 100644 --- a/tensorflow/core/kernels/random_poisson_op.cc +++ b/tensorflow/core/kernels/random_poisson_op.cc @@ -351,7 +351,7 @@ TF_CALL_double(REGISTER); REGISTER_ALL(Eigen::half); REGISTER_ALL(float); REGISTER_ALL(double); 
-REGISTER_ALL(int32);
+REGISTER_ALL(int32_t);
 REGISTER_ALL(int64_t);
 
 #undef REGISTER_ALL
diff --git a/tensorflow/core/kernels/random_poisson_op_test.cc b/tensorflow/core/kernels/random_poisson_op_test.cc
index ea2541d8bdf1b2..4d8f62a2e142d8 100644
--- a/tensorflow/core/kernels/random_poisson_op_test.cc
+++ b/tensorflow/core/kernels/random_poisson_op_test.cc
@@ -24,13 +24,13 @@ namespace tensorflow {
 namespace {
 
 Tensor VecShape(int64_t v) {
-  if (v >= std::numeric_limits<int32>::max()) {
+  if (v >= std::numeric_limits<int32_t>::max()) {
     Tensor shape(DT_INT64, TensorShape({1}));
     shape.vec<int64_t>()(0) = v;
     return shape;
   } else {
     Tensor shape(DT_INT32, TensorShape({1}));
-    shape.vec<int32>()(0) = v;
+    shape.vec<int32_t>()(0) = v;
     return shape;
   }
 }
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 856357489bdfab..c9c83d381e6ff9 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -45,7 +45,7 @@ class RandomShuffleQueue : public TypedQueue > {
   RandomShuffleQueue(int32_t capacity, int32_t min_after_dequeue, int64_t seed,
                      int64_t seed2, const DataTypeVector& component_dtypes,
                      const std::vector& component_shapes,
-                     const string& name);
+                     const std::string& name);
 
   absl::Status Initialize() override;  // Must be called before any other method.
 
@@ -61,7 +61,7 @@ class RandomShuffleQueue : public TypedQueue > {
                       CallbackWithTuple callback) override;
   absl::Status MatchesNodeDef(const NodeDef& node_def) override;
 
-  int32 size() const override {
+  int32_t size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
@@ -78,7 +78,7 @@ class RandomShuffleQueue : public TypedQueue > {
                                    OpKernelContext* ctx, Tensor* out_tensor);
 
-  const int32 min_after_dequeue_;
+  const int32_t min_after_dequeue_;
   const int64_t original_seed_;
   const int64_t original_seed2_;
 
@@ -93,7 +93,7 @@ class RandomShuffleQueue : public TypedQueue > {
 RandomShuffleQueue::RandomShuffleQueue(
     int32_t capacity, int32_t min_after_dequeue, int64_t seed, int64_t seed2,
     const DataTypeVector& component_dtypes,
-    const std::vector& component_shapes, const string& name)
+    const std::vector& component_shapes, const std::string& name)
     : TypedQueue(capacity, component_dtypes, component_shapes, name),
       min_after_dequeue_(min_after_dequeue),
       original_seed_(seed),
@@ -503,7 +503,7 @@ class RandomShuffleQueueOp : public TypedQueueOp {
     return CreateTypedQueue(queue, ret);
   }
 
-  int32 min_after_dequeue_;
+  int32_t min_after_dequeue_;
   int64_t seed_;
   int64_t seed2_;
   std::vector component_shapes_;
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
index db4f97c3e925de..2f8fb60c3b9f44 100644
--- a/tensorflow/core/kernels/range_sampler.cc
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -248,7 +248,7 @@ FixedUnigramSampler::FixedUnigramSampler(int64_t range, float distortion,
 }
 
 absl::Status FixedUnigramSampler::SetDistributionSampler(
-    Env* env, const string& vocab_file) {
+    Env* env, const std::string& vocab_file) {
   TF_RETURN_IF_ERROR(LoadFromFile(env, vocab_file, distortion_));
   if (!TF_PREDICT_TRUE(FixedUnigramSampler::range() == weights_.size()))
     return (errors::InvalidArgument("range is ", FixedUnigramSampler::range(),
@@ -287,18 +287,18 @@ void FixedUnigramSampler::FillReservedIds(int32_t num_reserved_ids) {
 }
 
 absl::Status FixedUnigramSampler::LoadFromFile(Env* env,
-                                               const string& vocab_file,
+                                               const std::string& vocab_file,
                                                float distortion) {
   std::unique_ptr file;
   TF_RETURN_IF_ERROR(env->NewRandomAccessFile(vocab_file, &file));
 
   io::InputBuffer in(file.get(), 262144 /*bytes*/);
-  string line;
+  std::string line;
   int32_t word_id = weights_.size();
   while (in.ReadLine(&line).ok()) {
     // The vocabulary file should be in csv like format, with the last
     // field the weight associated with the word.
-    std::vector<string> cols = str_util::Split(line, ',');
+    std::vector<std::string> cols = str_util::Split(line, ',');
    if (cols.empty()) continue;
     // Skip entries that do not belong to this shard.
     if (word_id % num_shards_ == shard_) {
diff --git a/tensorflow/core/kernels/range_sampler.h b/tensorflow/core/kernels/range_sampler.h
index c49bbcc5b1eede..cecb681cd4e973 100644
--- a/tensorflow/core/kernels/range_sampler.h
+++ b/tensorflow/core/kernels/range_sampler.h
@@ -208,7 +208,7 @@ class FixedUnigramSampler : public RangeSampler {
                      int32_t num_shards, int32_t shard);
   // The vocab_file is assumed to be a CSV, with the last entry of each row a
   // value representing the counts or probabilities for the corresponding ID.
-  absl::Status SetDistributionSampler(Env* env, const string& vocab_file);
+  absl::Status SetDistributionSampler(Env* env, const std::string& vocab_file);
   absl::Status SetDistributionSampler(const std::vector& unigrams);
 
   float Probability(int64_t value) const override;
@@ -225,14 +225,14 @@ class FixedUnigramSampler : public RangeSampler {
   // Sharding information of the sampler. The whole vocabulary is sharded
   // into num_shards_ smaller ranges and each sampler is responsible for one
   // such smaller range, identified by the shard number.
-  int32 num_shards_;
-  int32 shard_;
+  int32_t num_shards_;
+  int32_t shard_;
   float distortion_;
 
   // Fill the sampler with the appropriate number of reserved IDs.
   void FillReservedIds(int32_t num_reserved_ids);
   // Load IDs to sample from a CSV file. It is assumed that the last item of
   // each row contains a count or probability for the corresponding ID.
-  absl::Status LoadFromFile(Env* env, const string& vocab_file,
+  absl::Status LoadFromFile(Env* env, const std::string& vocab_file,
                             float distortion);
   // Load from an in-memory array.
void LoadFromUnigrams(const std::vector& unigrams, float distortion); diff --git a/tensorflow/core/kernels/range_sampler_test.cc b/tensorflow/core/kernels/range_sampler_test.cc index 1aeadc634ccea3..93891f10446311 100644 --- a/tensorflow/core/kernels/range_sampler_test.cc +++ b/tensorflow/core/kernels/range_sampler_test.cc @@ -157,7 +157,7 @@ static const char kVocabContent[] = "w9,256"; TEST_F(RangeSamplerTest, FixedUnigramProbabilities) { Env* env = Env::Default(); - string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); FixedUnigramSampler* test_sampler = new FixedUnigramSampler(9, 0.8, 0, 1, 0); TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname)); @@ -169,7 +169,7 @@ TEST_F(RangeSamplerTest, FixedUnigramProbabilities) { } TEST_F(RangeSamplerTest, FixedUnigramNoExistingFilename) { Env* env = Env::Default(); - string fname = "NoExistingFile"; + std::string fname = "NoExistingFile"; FixedUnigramSampler* test_sampler = new FixedUnigramSampler(9, 0.8, 0, 1, 0); absl::Status s = test_sampler->SetDistributionSampler(env, fname); sampler_.reset(test_sampler); @@ -177,7 +177,7 @@ TEST_F(RangeSamplerTest, FixedUnigramNoExistingFilename) { } TEST_F(RangeSamplerTest, FixedUnigramNoMatchingRangeWeights) { Env* env = Env::Default(); - string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); FixedUnigramSampler* test_sampler = new FixedUnigramSampler(8, 0.8, 0, 1, 0); absl::Status s = test_sampler->SetDistributionSampler(env, fname); @@ -186,7 +186,7 @@ TEST_F(RangeSamplerTest, FixedUnigramNoMatchingRangeWeights) { } TEST_F(RangeSamplerTest, FixedUnigramChecksum) { Env* env = Env::Default(); - string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); FixedUnigramSampler* test_sampler = new FixedUnigramSampler(9, 0.8, 0, 1, 0); TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname)); @@ -195,7 +195,7 @@ TEST_F(RangeSamplerTest, FixedUnigramChecksum) { } TEST_F(RangeSamplerTest, FixedUnigramHistogram) { Env* env = Env::Default(); - string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); FixedUnigramSampler* test_sampler = new FixedUnigramSampler(9, 0.8, 0, 1, 0); TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname)); @@ -204,7 +204,7 @@ TEST_F(RangeSamplerTest, FixedUnigramHistogram) { } TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1) { Env* env = Env::Default(); - string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); FixedUnigramSampler* test_sampler = new FixedUnigramSampler(10, 0.8, 1, 1, 0); TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname)); @@ -217,7 +217,7 @@ TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1) { } TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve2) { Env* env = Env::Default(); - string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); TF_CHECK_OK(WriteStringToFile(env, fname, 
kVocabContent)); FixedUnigramSampler* test_sampler = new FixedUnigramSampler(11, 0.8, 2, 1, 0); TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname)); diff --git a/tensorflow/core/kernels/record_input_op.cc b/tensorflow/core/kernels/record_input_op.cc index 1fae7e40af9abd..d1c3fbd1f70cb9 100644 --- a/tensorflow/core/kernels/record_input_op.cc +++ b/tensorflow/core/kernels/record_input_op.cc @@ -30,13 +30,13 @@ class RecordInputOp : public OpKernel { TYPE FIELD; \ OP_REQUIRES_OK(ctx, ctx->GetAttr(#FIELD, &FIELD)); - GETATTR(string, file_pattern); + GETATTR(std::string, file_pattern); GETATTR(int64_t, file_random_seed); GETATTR(float, file_shuffle_shift_ratio); GETATTR(int64_t, file_buffer_size); GETATTR(int64_t, file_parallelism); GETATTR(int64_t, batch_size); - GETATTR(string, compression_type); + GETATTR(std::string, compression_type); #undef GETATTR OP_REQUIRES_OK(ctx, ctx->GetAttr("compression_type", &compression_type)); diff --git a/tensorflow/core/kernels/record_yielder.cc b/tensorflow/core/kernels/record_yielder.cc index db8d59515c2f43..e186c92e7c3b30 100644 --- a/tensorflow/core/kernels/record_yielder.cc +++ b/tensorflow/core/kernels/record_yielder.cc @@ -82,10 +82,10 @@ bool RecordYielder::ShouldFinish(const absl::Status& s) { return stop_ || !status_.ok(); } -static absl::Status MatchFiles(const string& patterns, - std::vector* filenames) { +static absl::Status MatchFiles(const std::string& patterns, + std::vector* filenames) { for (const auto& file_pattern : str_util::Split(patterns, ',')) { - std::vector tmp_filenames; + std::vector tmp_filenames; TF_RETURN_IF_ERROR( Env::Default()->GetMatchingPaths(file_pattern, &tmp_filenames)); filenames->insert(filenames->end(), @@ -102,7 +102,7 @@ void RecordYielder::MainLoop() { num_records_added_in_epoch_ = 0; // Finds all files. - std::vector filenames; + std::vector filenames; absl::Status s = MatchFiles(opts_.file_pattern, &filenames); if (filenames.empty()) { @@ -121,7 +121,7 @@ void RecordYielder::MainLoop() { std::shuffle(filenames.begin(), filenames.end(), shuffle_rnd); // Left-shift the filename list. 
- const std::vector::size_type num = filenames.size(); + const std::vector::size_type num = filenames.size(); int64_t shift; if (0 <= opts_.file_shuffle_shift_ratio && opts_.file_shuffle_shift_ratio < 1) { @@ -136,7 +136,8 @@ void RecordYielder::MainLoop() { for (int i = 0; i < N; ++i) { Shard* shard = &shards[i]; shard->index = i; - for (std::vector::size_type j = i; j < filenames.size(); j += N) { + for (std::vector::size_type j = i; j < filenames.size(); + j += N) { shard->filenames.push_back(filenames[j]); } thread_->Schedule([this, shard]() { ShardLoop(shard); }); @@ -172,7 +173,7 @@ void RecordYielder::MainLoop() { main_loop_done_.Notify(); } -bool RecordYielder::Add(std::vector* values) { +bool RecordYielder::Add(std::vector* values) { mutex_lock l(mu_); while (!BufNotFull()) { buf_not_full_.wait(l); @@ -197,9 +198,9 @@ bool RecordYielder::Add(std::vector* values) { } void RecordYielder::ShardLoop(Shard* shard) { - std::vector values; + std::vector values; const int64_t kRecords = 16; - for (const string& filename : shard->filenames) { + for (const std::string& filename : shard->filenames) { std::unique_ptr file; if (ShouldFinish(absl::OkStatus())) break; absl::Status s = Env::Default()->NewRandomAccessFile(filename, &file); @@ -211,7 +212,7 @@ void RecordYielder::ShardLoop(Shard* shard) { io::RecordReaderOptions::CreateRecordReaderOptions( opts_.compression_type); io::RecordReader rdr(file.get(), options); - uint64 offset = 0; + uint64_t offset = 0; tstring record; while (true) { absl::Status s = rdr.ReadRecord(&offset, &record); diff --git a/tensorflow/core/kernels/record_yielder.h b/tensorflow/core/kernels/record_yielder.h index 6184a283ecece1..8f201082eac5f4 100644 --- a/tensorflow/core/kernels/record_yielder.h +++ b/tensorflow/core/kernels/record_yielder.h @@ -59,7 +59,7 @@ class RecordYielder { public: struct Options { // Glob pattern for tfrecords. - string file_pattern; + std::string file_pattern; // Random seed. It determines how data files are shuffled and how // records are shuffled. @@ -73,13 +73,13 @@ class RecordYielder { float file_shuffle_shift_ratio = 0; // Randomization buffer keeps these many records. - uint64 bufsize = 1; + uint64_t bufsize = 1; // Uses these many concurrent tfrecord iterators to iterate through // tfrecords. - int32 parallelism = 1; + int32_t parallelism = 1; - string compression_type; + std::string compression_type; }; explicit RecordYielder(OpKernelConstruction* context, @@ -116,7 +116,7 @@ class RecordYielder { std::mt19937_64 rnd_ TF_GUARDED_BY(mu_); // Randomization buffer. - std::vector buf_ TF_GUARDED_BY(mu_); + std::vector buf_ TF_GUARDED_BY(mu_); // True iff we are draining an epoch. bool epoch_end_ = false; @@ -145,14 +145,14 @@ class RecordYielder { // any. return stop_ || !status_.ok() || (epoch_end_ && !buf_.empty()) || (!epoch_end_ && - buf_.size() >= std::max(1, opts_.bufsize / 2)); + buf_.size() >= std::max(1, opts_.bufsize / 2)); } void MainLoop(); struct Shard; void ShardLoop(Shard* shard); bool ShouldFinish(const absl::Status& s); - bool Add(std::vector* values); + bool Add(std::vector* values); }; } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduce_join_op.cc b/tensorflow/core/kernels/reduce_join_op.cc index 6ee2ef0139a427..e05e4c3b4d6030 100644 --- a/tensorflow/core/kernels/reduce_join_op.cc +++ b/tensorflow/core/kernels/reduce_join_op.cc @@ -47,7 +47,7 @@ const absl::InlinedVector GetStrides(const TensorShape& shape) { // nonspecified dimensions set to 0. 
Dimensions must be ordered from outer-most // to inner-most with respect to the subset linear index. inline int64_t LinearSubIndexToFullIndex( - int64_t output_index, const absl::InlinedVector& dim_list, + int64_t output_index, const absl::InlinedVector& dim_list, const TensorShape& input_shape, const absl::InlinedVector& strides) { int64_t result = 0; @@ -63,7 +63,7 @@ inline int64_t LinearSubIndexToFullIndex( // Computes the number of input elements reduced per output element. int64_t GetReductionIterSize( - const absl::InlinedVector& reduced_indices, + const absl::InlinedVector& reduced_indices, const TensorShape& input_shape) { int64_t result = 1; for (int32_t reduce_dim : reduced_indices) { @@ -74,12 +74,12 @@ int64_t GetReductionIterSize( // Computes a list of all true reduced indices, accounting for negative // indices. -absl::InlinedVector GetReducedIndices(const Tensor& reduction_indices, - int32_t input_dims) { - const auto reduction_indices_flat = reduction_indices.flat(); +absl::InlinedVector GetReducedIndices( + const Tensor& reduction_indices, int32_t input_dims) { + const auto reduction_indices_flat = reduction_indices.flat(); const int32_t reduction_dims = reduction_indices_flat.size(); - absl::InlinedVector reduced_indices(reduction_dims); + absl::InlinedVector reduced_indices(reduction_dims); for (int32_t i = 0; i < reduction_dims; ++i) { reduced_indices[i] = reduction_indices_flat(reduction_dims - i - 1); reduced_indices[i] += reduced_indices[i] < 0 ? input_dims : 0; @@ -91,7 +91,7 @@ absl::InlinedVector GetReducedIndices(const Tensor& reduction_indices, // Appends all unreduced dimensions to the given vector. void MakeUnreducedIndices(absl::InlinedVector index_is_reduced, int32_t input_dims, - absl::InlinedVector* unreduced_indices) { + absl::InlinedVector* unreduced_indices) { for (int32_t index = 0; index < input_dims; ++index) { if (!index_is_reduced[index]) unreduced_indices->push_back(index); } @@ -128,7 +128,7 @@ class ReduceJoinOp : public OpKernel { const int32_t input_dims = input_shape.dims(); const Tensor& reduction_indices = context->input(1); - const auto reduction_indices_flat = reduction_indices.flat(); + const auto reduction_indices_flat = reduction_indices.flat(); const int32_t reduction_dims = reduction_indices_flat.size(); absl::InlinedVector index_is_reduced(input_dims, false); @@ -146,9 +146,9 @@ class ReduceJoinOp : public OpKernel { index_is_reduced[true_reduce_index] = true; } - absl::InlinedVector reduced_indices = + absl::InlinedVector reduced_indices = GetReducedIndices(reduction_indices, input_dims); - absl::InlinedVector unreduced_indices; + absl::InlinedVector unreduced_indices; MakeUnreducedIndices(index_is_reduced, input_dims, &unreduced_indices); const auto strides = GetStrides(input_shape); @@ -179,7 +179,7 @@ class ReduceJoinOp : public OpKernel { private: bool keep_dims_; - string separator_; + std::string separator_; }; REGISTER_KERNEL_BUILDER(Name("ReduceJoin").Device(DEVICE_CPU), ReduceJoinOp); diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h index 11b3ae7f328cd3..defa95f8c0f65d 100644 --- a/tensorflow/core/kernels/reduction_ops.h +++ b/tensorflow/core/kernels/reduction_ops.h @@ -118,12 +118,12 @@ struct ReduceEigenImpl("Tidx") + .TypeConstraint("Tidx") .Device(DEVICE_CPU) .HostMemory("reduction_indices"), - ReductionOp); + ReductionOp); REGISTER_KERNEL_BUILDER( Name("All") .TypeConstraint("Tidx") .Device(DEVICE_CPU) .HostMemory("reduction_indices"), - ReductionOp); + ReductionOp); #if 
GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER_KERNEL_BUILDER( diff --git a/tensorflow/core/kernels/reduction_ops_any.cc b/tensorflow/core/kernels/reduction_ops_any.cc index 0d5b531b6b9d22..9675bbccc0f7e2 100644 --- a/tensorflow/core/kernels/reduction_ops_any.cc +++ b/tensorflow/core/kernels/reduction_ops_any.cc @@ -19,16 +19,16 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER( Name("Any") - .TypeConstraint("Tidx") + .TypeConstraint("Tidx") .Device(DEVICE_CPU) .HostMemory("reduction_indices"), - ReductionOp); + ReductionOp); REGISTER_KERNEL_BUILDER( Name("Any") .TypeConstraint("Tidx") .Device(DEVICE_CPU) .HostMemory("reduction_indices"), - ReductionOp); + ReductionOp); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER_KERNEL_BUILDER( diff --git a/tensorflow/core/kernels/reduction_ops_common.cc b/tensorflow/core/kernels/reduction_ops_common.cc index 60f5b9462f8366..028743cf9c3d18 100644 --- a/tensorflow/core/kernels/reduction_ops_common.cc +++ b/tensorflow/core/kernels/reduction_ops_common.cc @@ -44,10 +44,10 @@ TensorShape ReductionHelper::shuffled_shape() { return shape; } -absl::InlinedVector ReductionHelper::permutation() { +absl::InlinedVector ReductionHelper::permutation() { const int dims = data_reshape_.size(); const int unreduced_dims = (dims + !reduce_first_axis_) / 2; - absl::InlinedVector perm(dims); + absl::InlinedVector perm(dims); for (int i = 0; i < unreduced_dims; i++) { perm[i] = 2 * i + reduce_first_axis_; } @@ -84,7 +84,7 @@ absl::Status ReductionHelper::Simplify(const Tensor& data, const Tensor& axis, // bitmap[i] indicates whether to reduce data along i-th axis. absl::InlinedVector bitmap(data.dims(), false); if (axis.dtype() == DT_INT32) { - TF_RETURN_IF_ERROR(SimplifyHelper(data, axis, bitmap)); + TF_RETURN_IF_ERROR(SimplifyHelper(data, axis, bitmap)); } else { TF_RETURN_IF_ERROR(SimplifyHelper(data, axis, bitmap)); } diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h index 6ce777f748a777..daab208f725bec 100644 --- a/tensorflow/core/kernels/reduction_ops_common.h +++ b/tensorflow/core/kernels/reduction_ops_common.h @@ -114,7 +114,7 @@ class ReductionHelper { TensorShape shuffled_shape(); // Permutation of reduced dims needed to put reduction dimensions at the end - absl::InlinedVector permutation(); + absl::InlinedVector permutation(); private: bool reduce_first_axis_; // True if need to reduce the 0-th dimension. diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc index 59d7c89b7795be..54025c4e612fe2 100644 --- a/tensorflow/core/kernels/reduction_ops_max.cc +++ b/tensorflow/core/kernels/reduction_ops_max.cc @@ -67,23 +67,23 @@ REGISTER_GPU_KERNELS(int64_t); // A special DEVICE_DEFAULT kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. 
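The ReduceJoin helpers above (GetReducedIndices and the unreduced-index builder) normalize negative reduction axes before computing strides. A minimal standalone sketch of that sign handling, independent of the TensorFlow types; the function name is illustrative, not the TF helper:

```cpp
#include <cstdint>
#include <vector>

// Map possibly-negative reduction axes onto [0, rank); for example, -1 on a
// rank-4 tensor becomes axis 3, matching the "+= input_dims if negative" step.
std::vector<int32_t> NormalizeAxes(const std::vector<int32_t>& axes,
                                   int32_t rank) {
  std::vector<int32_t> result(axes.size());
  for (size_t i = 0; i < axes.size(); ++i) {
    result[i] = axes[i] + (axes[i] < 0 ? rank : 0);
  }
  return result;
}
```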
-REGISTER_KERNEL_BUILDER( - Name("Max") - .Device(DEVICE_DEFAULT) - .HostMemory("reduction_indices") - .HostMemory("input") - .HostMemory("output") - .TypeConstraint("T") - .TypeConstraint("Tidx"), - ReductionOp>); -REGISTER_KERNEL_BUILDER( - Name("Max") - .Device(DEVICE_DEFAULT) - .HostMemory("reduction_indices") - .HostMemory("input") - .HostMemory("output") - .TypeConstraint("T") - .TypeConstraint("Tidx"), - ReductionOp>); +REGISTER_KERNEL_BUILDER(Name("Max") + .Device(DEVICE_DEFAULT) + .HostMemory("reduction_indices") + .HostMemory("input") + .HostMemory("output") + .TypeConstraint("T") + .TypeConstraint("Tidx"), + ReductionOp>); +REGISTER_KERNEL_BUILDER(Name("Max") + .Device(DEVICE_DEFAULT) + .HostMemory("reduction_indices") + .HostMemory("input") + .HostMemory("output") + .TypeConstraint("T") + .TypeConstraint("Tidx"), + ReductionOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc index d493cc7514b5d1..b81cd549373d2e 100644 --- a/tensorflow/core/kernels/reduction_ops_min.cc +++ b/tensorflow/core/kernels/reduction_ops_min.cc @@ -65,24 +65,23 @@ REGISTER_GPU_KERNELS(double); // A special DEVICE_DEFAULT kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. -REGISTER_KERNEL_BUILDER( - Name("Min") - .Device(DEVICE_DEFAULT) - .HostMemory("reduction_indices") - .HostMemory("input") - .HostMemory("output") - .TypeConstraint("T") - .TypeConstraint("Tidx"), - ReductionOp>); -REGISTER_KERNEL_BUILDER( - Name("Min") - .Device(DEVICE_DEFAULT) - .HostMemory("reduction_indices") - .HostMemory("input") - .HostMemory("output") - .TypeConstraint("T") - .TypeConstraint("Tidx"), - ReductionOp>); - +REGISTER_KERNEL_BUILDER(Name("Min") + .Device(DEVICE_DEFAULT) + .HostMemory("reduction_indices") + .HostMemory("input") + .HostMemory("output") + .TypeConstraint("T") + .TypeConstraint("Tidx"), + ReductionOp>); +REGISTER_KERNEL_BUILDER(Name("Min") + .Device(DEVICE_DEFAULT) + .HostMemory("reduction_indices") + .HostMemory("input") + .HostMemory("output") + .TypeConstraint("T") + .TypeConstraint("Tidx"), + ReductionOp>); } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc index e28964905fe85c..43b921c40829ce 100644 --- a/tensorflow/core/kernels/reduction_ops_sum.cc +++ b/tensorflow/core/kernels/reduction_ops_sum.cc @@ -60,24 +60,24 @@ TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS); // A special DEVICE_DEFAULT kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. 
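The Max/Min (and, below, Sum) hunks only re-indent the special DEVICE_DEFAULT int32 registrations; the angle-bracket template arguments were lost when this patch text was extracted. For orientation, a sketch of the usual shape of such a registration, with the device, element type, and reducer parameters filled in as assumptions about the common TensorFlow pattern rather than quoted from this patch:

```cpp
// Assumed reconstruction of the general pattern; the concrete template
// arguments in the real file may differ.
REGISTER_KERNEL_BUILDER(Name("Max")
                            .Device(DEVICE_DEFAULT)
                            .HostMemory("reduction_indices")
                            .HostMemory("input")
                            .HostMemory("output")
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int32>("Tidx"),
                        ReductionOp<CPUDevice, int32, int32,
                                    Eigen::internal::MaxReducer<int32>>);
```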
-REGISTER_KERNEL_BUILDER( - Name("Sum") - .Device(DEVICE_DEFAULT) - .TypeConstraint("T") - .TypeConstraint("Tidx") - .HostMemory("input") - .HostMemory("output") - .HostMemory("reduction_indices"), - ReductionOp>); -REGISTER_KERNEL_BUILDER( - Name("Sum") - .Device(DEVICE_DEFAULT) - .TypeConstraint("T") - .TypeConstraint("Tidx") - .HostMemory("input") - .HostMemory("output") - .HostMemory("reduction_indices"), - ReductionOp>); +REGISTER_KERNEL_BUILDER(Name("Sum") + .Device(DEVICE_DEFAULT) + .TypeConstraint("T") + .TypeConstraint("Tidx") + .HostMemory("input") + .HostMemory("output") + .HostMemory("reduction_indices"), + ReductionOp>); +REGISTER_KERNEL_BUILDER(Name("Sum") + .Device(DEVICE_DEFAULT) + .TypeConstraint("T") + .TypeConstraint("Tidx") + .HostMemory("input") + .HostMemory("output") + .HostMemory("reduction_indices"), + ReductionOp>); #endif diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc index cb5fda312752ac..4c77592f5dbf36 100644 --- a/tensorflow/core/kernels/reduction_ops_test.cc +++ b/tensorflow/core/kernels/reduction_ops_test.cc @@ -24,58 +24,58 @@ namespace tensorflow { // Creates a Graph which "reduce"s a 3D float tensor of "num" elements // into a scalar. template -static Graph* ToScalar(const string& reduce, int num_x, int num_y) { +static Graph* ToScalar(const std::string& reduce, int num_x, int num_y) { auto* g = new Graph(OpRegistry::Global()); Tensor data(DataTypeToEnum::value, TensorShape({num_x, num_y})); data.flat().setRandom(); Tensor axes(DT_INT32, TensorShape({2})); - axes.flat()(0) = 0; - axes.flat()(1) = 1; + axes.flat()(0) = 0; + axes.flat()(1) = 1; test::graph::Reduce(g, reduce, test::graph::Constant(g, data), test::graph::Constant(g, axes)); return g; } -static Graph* ColReduce(const string& reduce, int num_x, int num_y) { +static Graph* ColReduce(const std::string& reduce, int num_x, int num_y) { auto* g = new Graph(OpRegistry::Global()); Tensor data(DT_FLOAT, TensorShape({num_x, num_y})); data.flat().setRandom(); Tensor axes(DT_INT32, TensorShape({1})); - axes.flat()(0) = 0; + axes.flat()(0) = 0; test::graph::Reduce(g, reduce, test::graph::Constant(g, data), test::graph::Constant(g, axes)); return g; } -static Graph* RowReduce(const string& reduce, int num_x, int num_y) { +static Graph* RowReduce(const std::string& reduce, int num_x, int num_y) { auto* g = new Graph(OpRegistry::Global()); Tensor data(DT_FLOAT, TensorShape({num_x, num_y})); data.flat().setRandom(); Tensor axes(DT_INT32, TensorShape({1})); - axes.flat()(0) = 1; + axes.flat()(0) = 1; test::graph::Reduce(g, reduce, test::graph::Constant(g, data), test::graph::Constant(g, axes)); return g; } -static Graph* ThreeDYReduce(const string& reduce, int num_y, int num_z) { +static Graph* ThreeDYReduce(const std::string& reduce, int num_y, int num_z) { auto* g = new Graph(OpRegistry::Global()); Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z})); data.flat().setRandom(); Tensor axes(DT_INT32, TensorShape({1})); - axes.flat()(0) = 1; + axes.flat()(0) = 1; test::graph::Reduce(g, reduce, test::graph::Constant(g, data), test::graph::Constant(g, axes)); return g; } -static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) { +static Graph* ThreeDXZReduce(const std::string& reduce, int num_y, int num_z) { auto* g = new Graph(OpRegistry::Global()); Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z})); data.flat().setRandom(); Tensor axes(DT_INT32, TensorShape({2})); - axes.flat()(0) = 0; - axes.flat()(1) = 2; + axes.flat()(0) = 0; + 
axes.flat()(1) = 2; test::graph::Reduce(g, reduce, test::graph::Constant(g, data), test::graph::Constant(g, axes)); return g; @@ -85,7 +85,7 @@ static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) { // into a scalar on a "device". Runs the bench for "iters" times. template static void ReduceToScalar(::testing::benchmark::State& state, - const string& device, const string& reduce, + const std::string& device, const std::string& reduce, int num_x, int num_y) { test::Benchmark(device, ToScalar(reduce, num_x, num_y), /*old_benchmark_api*/ false) @@ -97,8 +97,8 @@ static void ReduceToScalar(::testing::benchmark::State& state, } static void DoRowReduce(::testing::benchmark::State& state, - const string& device, const string& reduce, int num_x, - int num_y) { + const std::string& device, const std::string& reduce, + int num_x, int num_y) { test::Benchmark(device, RowReduce(reduce, num_x, num_y), /*old_benchmark_api*/ false) .Run(state); @@ -109,8 +109,8 @@ static void DoRowReduce(::testing::benchmark::State& state, } static void DoColReduce(::testing::benchmark::State& state, - const string& device, const string& reduce, int num_x, - int num_y) { + const std::string& device, const std::string& reduce, + int num_x, int num_y) { test::Benchmark(device, ColReduce(reduce, num_x, num_y), /*old_benchmark_api*/ false) .Run(state); @@ -121,8 +121,8 @@ static void DoColReduce(::testing::benchmark::State& state, } static void Do3DYReduce(::testing::benchmark::State& state, - const string& device, const string& reduce, int num_x, - int num_y) { + const std::string& device, const std::string& reduce, + int num_x, int num_y) { test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y), /*old_benchmark_api*/ false) .Run(state); @@ -133,8 +133,8 @@ static void Do3DYReduce(::testing::benchmark::State& state, } static void Do3DXZReduce(::testing::benchmark::State& state, - const string& device, const string& reduce, int num_x, - int num_y) { + const std::string& device, const std::string& reduce, + int num_x, int num_y) { test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y), /*old_benchmark_api*/ false) .Run(state); diff --git a/tensorflow/core/kernels/reference_gemm.h b/tensorflow/core/kernels/reference_gemm.h index 9d0bb60ed436b4..e90656fd36b298 100644 --- a/tensorflow/core/kernels/reference_gemm.h +++ b/tensorflow/core/kernels/reference_gemm.h @@ -64,8 +64,8 @@ void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c, c_j_stride = 1; } - const int32_t highest = static_cast(Eigen::NumTraits::highest()); - const int32_t lowest = static_cast(Eigen::NumTraits::lowest()); + const int32_t highest = static_cast(Eigen::NumTraits::highest()); + const int32_t lowest = static_cast(Eigen::NumTraits::lowest()); const int32_t rounding = (shift_c < 1) ? 
0 : (1 << (shift_c - 1)); int i, j, l; @@ -74,9 +74,9 @@ void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c, int32_t total = 0; for (l = 0; l < k; l++) { const size_t a_index = ((i * a_i_stride) + (l * a_l_stride)); - const int32_t a_value = static_cast(a[a_index]) - offset_a; + const int32_t a_value = static_cast(a[a_index]) - offset_a; const size_t b_index = ((j * b_j_stride) + (l * b_l_stride)); - const int32_t b_value = static_cast(b[b_index]) - offset_b; + const int32_t b_value = static_cast(b[b_index]) - offset_b; total += (a_value * b_value); } const size_t c_index = ((i * c_i_stride) + (j * c_j_stride)); diff --git a/tensorflow/core/kernels/regex_full_match_op.cc b/tensorflow/core/kernels/regex_full_match_op.cc index ddcc165cf5fd18..23be3bd76534fd 100644 --- a/tensorflow/core/kernels/regex_full_match_op.cc +++ b/tensorflow/core/kernels/regex_full_match_op.cc @@ -41,7 +41,7 @@ class RegexFullMatchOp : public OpKernel { OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(pattern_tensor->shape()), errors::InvalidArgument("Pattern must be scalar, but received ", pattern_tensor->shape().DebugString())); - const string pattern = pattern_tensor->flat()(0); + const std::string pattern = pattern_tensor->flat()(0); std::shared_ptr regex = CachedRE2(pattern); OP_REQUIRES(ctx, regex->ok(), errors::InvalidArgument("Invalid pattern: ", pattern, @@ -57,7 +57,7 @@ class RegexFullMatchOp : public OpKernel { } private: - std::shared_ptr CachedRE2(const string& pattern) { + std::shared_ptr CachedRE2(const std::string& pattern) { { tf_shared_lock l(mu_); if (regex_ != nullptr && regex_->pattern() == pattern) { @@ -88,7 +88,7 @@ REGISTER_KERNEL_BUILDER(Name("RegexFullMatch").Device(DEVICE_CPU), class StaticRegexFullMatchOp : public OpKernel { public: explicit StaticRegexFullMatchOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - string pattern; + std::string pattern; OP_REQUIRES_OK(ctx, ctx->GetAttr("pattern", &pattern)); re_ = std::make_unique(pattern); OP_REQUIRES(ctx, re_->ok(), diff --git a/tensorflow/core/kernels/regex_replace_op_test.cc b/tensorflow/core/kernels/regex_replace_op_test.cc index 73979d41222f3b..41ee85d7e4b02b 100644 --- a/tensorflow/core/kernels/regex_replace_op_test.cc +++ b/tensorflow/core/kernels/regex_replace_op_test.cc @@ -67,8 +67,9 @@ Tensor GetTestTensor(int batch) { return t; } -Graph* SetupRegexReplaceGraph(const Tensor& input, const string& input_pattern, - const string& input_rewrite) { +Graph* SetupRegexReplaceGraph(const Tensor& input, + const std::string& input_pattern, + const std::string& input_rewrite) { Graph* g = new Graph(OpRegistry::Global()); Tensor pattern(DT_STRING, TensorShape({})); pattern.flat().setConstant(input_pattern); @@ -103,8 +104,8 @@ BENCHMARK(BM_RegexReplace) ->Arg(128) ->Arg(256); -Graph* SetupStaticGraph(const Tensor& input, const string& input_pattern, - const string& rewrite) { +Graph* SetupStaticGraph(const Tensor& input, const std::string& input_pattern, + const std::string& rewrite) { Graph* g = new Graph(OpRegistry::Global()); TF_CHECK_OK(NodeBuilder("static_regex_replace_op", "StaticRegexReplace") diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index a1bba19fc27506..6cfc5354f95419 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -44,10 +44,10 @@ namespace functor { __global__ void ReluGradHalfKernel(const Eigen::half* __restrict__ gradient, const Eigen::half* __restrict__ feature, Eigen::half* __restrict__ backprop, - 
int32 count) { - int32 half2_count = count >> 1; - int32 index = blockIdx.x * blockDim.x + threadIdx.x; - const int32 total_device_threads = gridDim.x * blockDim.x; + int32_t count) { + int32_t half2_count = count >> 1; + int32_t index = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t total_device_threads = gridDim.x * blockDim.x; while (index < half2_count) { // The fast branch. @@ -97,9 +97,9 @@ __global__ void ReluGradHalfKernel(const Eigen::half* __restrict__ gradient, __global__ void ReluGradHalfKernelVector( const Eigen::half* __restrict__ gradient, const Eigen::half* __restrict__ feature, Eigen::half* __restrict__ backprop, - int32 count) { - int32 half8_count = count / VectorSizeElements; - int32 index = blockIdx.x * blockDim.x + threadIdx.x; + int32_t count) { + int32_t half8_count = count / VectorSizeElements; + int32_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < half8_count) { // Cast to xx_h8 for vector load and store. @@ -174,17 +174,17 @@ struct ReluGrad { auto backprop_ptr = reinterpret_cast(backprop.data()); bool aligned = gradient_ptr % 16 == 0 && feature_ptr % 16 == 0 && backprop_ptr % 16 == 0; - int32 count = gradient.size(); - constexpr int32 kThreadInBlock = 512; + int32_t count = gradient.size(); + constexpr int32_t kThreadInBlock = 512; if (count == 0) return; if (aligned) { - int32 half8_count = Eigen::divup(count, VectorSizeElements); - int32 kBlock = Eigen::divup(half8_count, kThreadInBlock); + int32_t half8_count = Eigen::divup(count, VectorSizeElements); + int32_t kBlock = Eigen::divup(half8_count, kThreadInBlock); TF_CHECK_OK(GpuLaunchKernel( ReluGradHalfKernelVector, kBlock, kThreadInBlock, 0, d.stream(), gradient.data(), feature.data(), backprop.data(), count)); } else { - int32 half2_count = Eigen::divup(count, 2); + int32_t half2_count = Eigen::divup(count, 2); GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); TF_CHECK_OK(GpuLaunchKernel( @@ -195,8 +195,8 @@ struct ReluGrad { }; __global__ void Relu_int8x4_kernel(int vect_count, - const int32* __restrict__ input, - int32* __restrict__ output) { + const int32_t* __restrict__ input, + int32_t* __restrict__ output) { CUDA_1D_KERNEL_LOOP(index, vect_count) { #if GOOGLE_CUDA output[index] = __vmaxs4(input[index], 0); @@ -221,17 +221,17 @@ struct Relu { // 'output' should have the same size as 'input'. 
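Relu_int8x4_kernel above applies ReLU to four int8 values packed into each 32-bit word, which is why the functor divides the element count by 4 before launching. A self-contained CUDA sketch of that trick (illustrative, CUDA-only; the TF kernel also has a ROCm branch):

```cpp
// Each thread handles one 32-bit word, i.e. four packed int8 lanes.
// __vmaxs4 computes a per-byte signed max, so max(x, 0) per lane is ReLU.
__global__ void ReluInt8x4Sketch(int num_words, const int* __restrict__ in,
                                 int* __restrict__ out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_words) {
    out[i] = static_cast<int>(__vmaxs4(in[i], 0));
  }
}
```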
void operator()(const Device& d, typename TTypes::ConstTensor input, typename TTypes::Tensor output) { - int32 count = input.size(); + int32_t count = input.size(); if (count == 0) return; - int32 vect_count = Eigen::divup(count, 4); - constexpr int32 kThreadInBlock = 512; + int32_t vect_count = Eigen::divup(count, 4); + constexpr int32_t kThreadInBlock = 512; GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( vect_count, d, Relu_int8x4_kernel, 0, kThreadInBlock); TF_CHECK_OK(GpuLaunchKernel( Relu_int8x4_kernel, config.block_count, config.thread_per_block, 0, - d.stream(), vect_count, reinterpret_cast(input.data()), - reinterpret_cast(output.data()))); + d.stream(), vect_count, reinterpret_cast(input.data()), + reinterpret_cast(output.data()))); } }; diff --git a/tensorflow/core/kernels/reshape_op.h b/tensorflow/core/kernels/reshape_op.h index 554c142b9db87a..127381c00034f7 100644 --- a/tensorflow/core/kernels/reshape_op.h +++ b/tensorflow/core/kernels/reshape_op.h @@ -61,8 +61,8 @@ class ReshapeOp : public OpKernel { switch (sizes.dtype()) { case DT_INT32: OP_REQUIRES_OK(context, - ValidateSizes(sizes, &product, &unknown_index, - &shape, &sizes_has_zero_dim)); + ValidateSizes(sizes, &product, &unknown_index, + &shape, &sizes_has_zero_dim)); break; case DT_INT64: OP_REQUIRES_OK(context, @@ -145,7 +145,7 @@ class ReshapeOp : public OpKernel { *has_zero_dim = true; } else { if (MultiplyWithoutOverflow(shape->num_elements(), size) < 0) { - string msg; + std::string msg; for (int ii = 0; ii < num_dims; ++ii) { if (ii != 0) { absl::StrAppend(&msg, ", "); diff --git a/tensorflow/core/kernels/reshape_util_gpu.cu.cc b/tensorflow/core/kernels/reshape_util_gpu.cu.cc index 22f09a0ee92aa8..f3c48ef42c9ae4 100644 --- a/tensorflow/core/kernels/reshape_util_gpu.cu.cc +++ b/tensorflow/core/kernels/reshape_util_gpu.cu.cc @@ -36,7 +36,7 @@ __global__ void ReshapeSparseTensorKernel( GPU_1D_KERNEL_LOOP(sparse_index, nnz) { const Tindex* input_index = &input_indices[sparse_index * input_rank]; Tindex* output_index = &output_indices[sparse_index * output_rank]; - int64 dense_index = 0; // int64 to avoid overflow if Tindex is int32 + int64_t dense_index = 0; // int64 to avoid overflow if Tindex is int32 // Flatten input index from slowest- to fastest-changing dimension. for (int i = 0; i < input_rank; ++i) { dense_index = dense_index * input_shape[i] + input_index[i]; @@ -55,14 +55,14 @@ __global__ void ReshapeSparseTensorKernel( namespace functor { template <> -Status ReshapeSparseTensorFunctor::operator()( +absl::Status ReshapeSparseTensorFunctor::operator()( OpKernelContext* context, const TensorShape& input_shape, const TensorShape& output_shape, typename TTypes::ConstMatrix input_indices, typename TTypes::Matrix output_indices) const { - const int64 input_rank = input_shape.dims(); - const int64 output_rank = output_shape.dims(); - const int64 nnz = input_indices.dimension(0); + const int64_t input_rank = input_shape.dims(); + const int64_t output_rank = output_shape.dims(); + const int64_t nnz = input_indices.dimension(0); // We copy input_shape and output_shape to the GPU and then launch a kernel // to compute output_indices. 
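ValidateSizes in the reshape_op.h hunk rejects requested shapes whose element count would overflow by checking that MultiplyWithoutOverflow stays non-negative. A standalone sketch of that guard for non-negative dimension sizes; the helper name and exact contract here are a stand-in, not TF's implementation:

```cpp
#include <cstdint>
#include <limits>

// Multiply two non-negative dimension sizes; return -1 instead of wrapping
// around, so callers can treat a negative result as "too many elements".
int64_t MultiplyNonNegativeOrMinusOne(int64_t x, int64_t y) {
  if (x == 0 || y == 0) return 0;
  if (x > std::numeric_limits<int64_t>::max() / y) return -1;
  return x * y;
}
```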
Tensor input_shape_gpu_t; @@ -75,16 +75,16 @@ Status ReshapeSparseTensorFunctor::operator()( auto output_shape_gpu = output_shape_gpu_t.flat(); se::Stream* stream = context->op_device_context()->stream(); if (!stream) return errors::Internal("No GPU stream available."); - se::DeviceMemoryBase input_shape_gpu_mem(input_shape_gpu.data(), - input_rank * sizeof(int64)); + stream_executor::DeviceAddressBase input_shape_gpu_mem( + input_shape_gpu.data(), input_rank * sizeof(int64_t)); TF_RETURN_IF_ERROR(stream->Memcpy(&input_shape_gpu_mem, input_shape.dim_sizes().data(), - input_rank * sizeof(int64))); - se::DeviceMemoryBase output_shape_gpu_mem(output_shape_gpu.data(), - output_rank * sizeof(int64)); + input_rank * sizeof(int64_t))); + stream_executor::DeviceAddressBase output_shape_gpu_mem( + output_shape_gpu.data(), output_rank * sizeof(int64_t)); TF_RETURN_IF_ERROR(stream->Memcpy(&output_shape_gpu_mem, output_shape.dim_sizes().data(), - output_rank * sizeof(int64))); + output_rank * sizeof(int64_t))); const GPUDevice& device = context->template eigen_device(); auto config = GetGpuLaunchConfig(nnz, device); return GpuLaunchKernel(ReshapeSparseTensorKernel, config.block_count, diff --git a/tensorflow/core/kernels/resource_ops_test.cc b/tensorflow/core/kernels/resource_ops_test.cc index ffc2815d4201d3..43df25dc056eb5 100644 --- a/tensorflow/core/kernels/resource_ops_test.cc +++ b/tensorflow/core/kernels/resource_ops_test.cc @@ -42,7 +42,7 @@ class MockResource : public ResourceBase { *alive_ = false; } } - string DebugString() const override { return ""; } + std::string DebugString() const override { return ""; } bool* alive_; int payload_; }; @@ -103,7 +103,7 @@ TEST_F(MockHandleCreationOpTest, RefCounting) { // Feed and run AddInputFromArray(TensorShape({}), {reinterpret_cast(&alive)}); - AddInputFromArray(TensorShape({}), {payload}); + AddInputFromArray(TensorShape({}), {payload}); TF_ASSERT_OK(RunOpKernel()); EXPECT_TRUE(alive); diff --git a/tensorflow/core/kernels/resource_variable_ops.h b/tensorflow/core/kernels/resource_variable_ops.h index 1c8d79988a2457..53a52e6cda4303 100644 --- a/tensorflow/core/kernels/resource_variable_ops.h +++ b/tensorflow/core/kernels/resource_variable_ops.h @@ -32,9 +32,9 @@ class VarHandleOp : public OpKernel { private: // Same fields as in ResourceHandleOp. 
bool is_anonymous_; - string container_; - string name_; - string debug_name_; + std::string container_; + std::string name_; + std::string debug_name_; Tensor const_tensor_; DtypeAndPartialTensorShape dtype_and_shape_; diff --git a/tensorflow/core/kernels/restore_op_test.cc b/tensorflow/core/kernels/restore_op_test.cc index 15dacaf6d93c45..16bfd01ab4f335 100644 --- a/tensorflow/core/kernels/restore_op_test.cc +++ b/tensorflow/core/kernels/restore_op_test.cc @@ -62,8 +62,8 @@ Tensor MakeInput(const TensorShape& shape, } TEST_F(RestoreOpTest, RestoreSimple) { - const string filename = io::JoinPath(testing::TmpDir(), "tensor_simple"); - const std::vector tensor_names = { + const std::string filename = io::JoinPath(testing::TmpDir(), "tensor_simple"); + const std::vector tensor_names = { "tensor_bool", "tensor_int", "tensor_float", "tensor_double", "tensor_qint8", "tensor_qint32", "tensor_uint8", "tensor_int8", "tensor_int16", "tensor_int64", "tensor_string", "tensor_complex64", @@ -103,7 +103,7 @@ TEST_F(RestoreOpTest, RestoreSimple) { // Input #1 is the tensor names Tensor input_1 = MakeInput( TensorShape({static_cast(tensor_names.size())}), - [&tensor_names](int x) -> string { return tensor_names[x]; }); + [&tensor_names](int x) -> std::string { return tensor_names[x]; }); inputs.push_back({nullptr, &input_1}); // Input #2 is a 1-d bool tensor @@ -111,8 +111,8 @@ TEST_F(RestoreOpTest, RestoreSimple) { MakeInput(TensorShape({2}), [](int x) -> bool { return x != 0; }); inputs.push_back({nullptr, &input_2}); // Input #3 is a 1-d integer tensor - Tensor input_3 = MakeInput(TensorShape({10}), - [](int x) -> int32 { return x + 1; }); + Tensor input_3 = MakeInput(TensorShape({10}), + [](int x) -> int32_t { return x + 1; }); inputs.push_back({nullptr, &input_3}); // Input #4 is a 2-d float tensor Tensor input_4 = MakeInput(TensorShape({2, 4}), [](int x) -> float { @@ -136,24 +136,25 @@ TEST_F(RestoreOpTest, RestoreSimple) { }); inputs.push_back({nullptr, &input_7}); // Input #8 is a 1-d uint8 tensor - Tensor input_8 = MakeInput(TensorShape({11}), - [](int x) -> uint8 { return x + 1; }); + Tensor input_8 = MakeInput(TensorShape({11}), + [](int x) -> uint8_t { return x + 1; }); inputs.push_back({nullptr, &input_8}); // Input #9 is a 1-d int8 tensor - Tensor input_9 = - MakeInput(TensorShape({7}), [](int x) -> int8 { return x - 7; }); + Tensor input_9 = MakeInput(TensorShape({7}), + [](int x) -> int8_t { return x - 7; }); inputs.push_back({nullptr, &input_9}); // Input #10 is a 1-d int16 tensor - Tensor input_10 = MakeInput(TensorShape({7}), - [](int x) -> int16 { return x - 8; }); + Tensor input_10 = MakeInput( + TensorShape({7}), [](int x) -> int16_t { return x - 8; }); inputs.push_back({nullptr, &input_10}); // Input #11 is a 1-d int64 tensor - Tensor input_11 = MakeInput(TensorShape({9}), - [](int x) -> int64 { return x - 9; }); + Tensor input_11 = MakeInput( + TensorShape({9}), [](int x) -> int64_t { return x - 9; }); inputs.push_back({nullptr, &input_11}); // Input #12 is a 1-d string tensor Tensor input_12 = MakeInput( - TensorShape({2}), [](int x) -> string { return x ? "yes" : "no"; }); + TensorShape({2}), + [](int x) -> std::string { return x ? 
"yes" : "no"; }); inputs.push_back({nullptr, &input_12}); // Input #13 is a 1-d complex64 tensor Tensor input_13 = MakeInput( @@ -212,7 +213,7 @@ TEST_F(RestoreOpTest, RestoreSimple) { TensorShape expected({10}); EXPECT_TRUE(output->shape().IsSameSize(expected)); for (int i = 0; i < 10; ++i) { - EXPECT_EQ(i + 1, output->flat()(i)); + EXPECT_EQ(i + 1, output->flat()(i)); } } // The 2-d float tensor @@ -273,7 +274,7 @@ TEST_F(RestoreOpTest, RestoreSimple) { TensorShape expected({11}); EXPECT_TRUE(output->shape().IsSameSize(expected)); for (int i = 0; i < 11; ++i) { - EXPECT_EQ(i + 1, output->flat()(i)); + EXPECT_EQ(i + 1, output->flat()(i)); } } // The 1-d int8 tensor @@ -285,7 +286,7 @@ TEST_F(RestoreOpTest, RestoreSimple) { TensorShape expected({7}); EXPECT_TRUE(output->shape().IsSameSize(expected)); for (int i = 0; i < 7; ++i) { - EXPECT_EQ(i - 7, output->flat()(i)); + EXPECT_EQ(i - 7, output->flat()(i)); } } // The 1-d int16 tensor @@ -297,7 +298,7 @@ TEST_F(RestoreOpTest, RestoreSimple) { TensorShape expected({7}); EXPECT_TRUE(output->shape().IsSameSize(expected)); for (int i = 0; i < 7; ++i) { - EXPECT_EQ(i - 8, output->flat()(i)); + EXPECT_EQ(i - 8, output->flat()(i)); } } // The 1-d int64 tensor @@ -373,8 +374,8 @@ class RestoreSliceOpTest : public OpsTestBase { }; TEST_F(RestoreSliceOpTest, RestoreInt) { - const string filename = io::JoinPath(testing::TmpDir(), "tensor_int"); - const string tensor_name = "tensor_int"; + const std::string filename = io::JoinPath(testing::TmpDir(), "tensor_int"); + const std::string tensor_name = "tensor_int"; // We first need to write a tensor using the save_op { @@ -412,7 +413,7 @@ TEST_F(RestoreSliceOpTest, RestoreInt) { // Input #2 is a 4x16 integer tensor. Tensor input_2(DT_INT32, TensorShape({4, 16})); for (int64_t i = 0; i < input_2.NumElements(); ++i) { - input_2.flat()(i) = i + 1; + input_2.flat()(i) = i + 1; } inputs.push_back({nullptr, &input_2}); @@ -433,7 +434,7 @@ TEST_F(RestoreSliceOpTest, RestoreInt) { // Now we restore MakeRestoreSliceOp(DT_INT32); - string shape_and_slice = "4 16 0,2:-"; + std::string shape_and_slice = "4 16 0,2:-"; // Add a file name AddInput(TensorShape({}), [&filename](int x) -> tstring { return filename; }); @@ -452,7 +453,7 @@ TEST_F(RestoreSliceOpTest, RestoreInt) { TensorShape expected({2, 16}); EXPECT_TRUE(output->shape().IsSameSize(expected)); for (int64_t i = 0; i < expected.num_elements(); ++i) { - EXPECT_EQ(i + 1, output->flat()(i)); + EXPECT_EQ(i + 1, output->flat()(i)); } } diff --git a/tensorflow/core/kernels/restore_v2_op_test.cc b/tensorflow/core/kernels/restore_v2_op_test.cc index c102cc42e2063f..0a66a0f31d4366 100644 --- a/tensorflow/core/kernels/restore_v2_op_test.cc +++ b/tensorflow/core/kernels/restore_v2_op_test.cc @@ -61,9 +61,9 @@ class RestoreV2OpTest : public OpsTestBase { } void RunTest(absl::string_view save_op_to_use) { - const string filename = + const std::string filename = io::JoinPath(testing::TmpDir(), "tensor_simple-", save_op_to_use); - const std::vector tensor_names = { + const std::vector tensor_names = { "tensor_bool", "tensor_int", "tensor_float", "tensor_double", "tensor_qint8", "tensor_qint32", "tensor_uint8", "tensor_int8", "tensor_int16", "tensor_int64", "tensor_complex64", "tensor_half"}; @@ -114,12 +114,12 @@ class RestoreV2OpTest : public OpsTestBase { // Input #1 is the tensor names Tensor input_1 = MakeInput( TensorShape({static_cast(tensor_names.size())}), - [&tensor_names](int x) -> string { return tensor_names[x]; }); + [&tensor_names](int x) -> std::string { 
return tensor_names[x]; }); inputs.push_back({nullptr, &input_1}); Tensor shape_and_slices = MakeInput( TensorShape({static_cast(tensor_names.size())}), - [](int x) -> string { return "" /* saves in full */; }); + [](int x) -> std::string { return "" /* saves in full */; }); if (save_op_to_use != "Save") { inputs.push_back({nullptr, &shape_and_slices}); } @@ -129,8 +129,8 @@ class RestoreV2OpTest : public OpsTestBase { [](int x) -> bool { return x != 0; }); inputs.push_back({nullptr, &input_2}); // Input #3 is a 1-d integer tensor - Tensor input_3 = MakeInput(TensorShape({10}), - [](int x) -> int32 { return x + 1; }); + Tensor input_3 = MakeInput( + TensorShape({10}), [](int x) -> int32_t { return x + 1; }); inputs.push_back({nullptr, &input_3}); // Input #4 is a 2-d float tensor Tensor input_4 = MakeInput( @@ -154,20 +154,20 @@ class RestoreV2OpTest : public OpsTestBase { }); inputs.push_back({nullptr, &input_7}); // Input #8 is a 1-d uint8 tensor - Tensor input_8 = MakeInput(TensorShape({11}), - [](int x) -> uint8 { return x + 1; }); + Tensor input_8 = MakeInput( + TensorShape({11}), [](int x) -> uint8_t { return x + 1; }); inputs.push_back({nullptr, &input_8}); // Input #9 is a 1-d int8 tensor - Tensor input_9 = MakeInput(TensorShape({7}), - [](int x) -> int8 { return x - 7; }); + Tensor input_9 = MakeInput(TensorShape({7}), + [](int x) -> int8_t { return x - 7; }); inputs.push_back({nullptr, &input_9}); // Input #10 is a 1-d int16 tensor - Tensor input_10 = MakeInput(TensorShape({7}), - [](int x) -> int16 { return x - 8; }); + Tensor input_10 = MakeInput( + TensorShape({7}), [](int x) -> int16_t { return x - 8; }); inputs.push_back({nullptr, &input_10}); // Input #11 is a 1-d int64 tensor Tensor input_11 = MakeInput( - TensorShape({9}), [](int x) -> int64 { return x - 9; }); + TensorShape({9}), [](int x) -> int64_t { return x - 9; }); inputs.push_back({nullptr, &input_11}); // Input #12 is a 1-d complex64 tensor Tensor input_13 = MakeInput( @@ -222,7 +222,7 @@ class RestoreV2OpTest : public OpsTestBase { TensorShape expected({10}); EXPECT_TRUE(output->shape().IsSameSize(expected)); for (int i = 0; i < 10; ++i) { - EXPECT_EQ(i + 1, output->flat()(i)); + EXPECT_EQ(i + 1, output->flat()(i)); } } // The 2-d float tensor @@ -283,7 +283,7 @@ class RestoreV2OpTest : public OpsTestBase { TensorShape expected({11}); EXPECT_TRUE(output->shape().IsSameSize(expected)); for (int i = 0; i < 11; ++i) { - EXPECT_EQ(i + 1, output->flat()(i)); + EXPECT_EQ(i + 1, output->flat()(i)); } } // The 1-d int8 tensor @@ -295,7 +295,7 @@ class RestoreV2OpTest : public OpsTestBase { TensorShape expected({7}); EXPECT_TRUE(output->shape().IsSameSize(expected)); for (int i = 0; i < 7; ++i) { - EXPECT_EQ(i - 7, output->flat()(i)); + EXPECT_EQ(i - 7, output->flat()(i)); } } // The 1-d int16 tensor @@ -307,7 +307,7 @@ class RestoreV2OpTest : public OpsTestBase { TensorShape expected({7}); EXPECT_TRUE(output->shape().IsSameSize(expected)); for (int i = 0; i < 7; ++i) { - EXPECT_EQ(i - 8, output->flat()(i)); + EXPECT_EQ(i - 8, output->flat()(i)); } } // The 1-d int64 tensor diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc index 09606abc6c61e6..632a5136db8280 100644 --- a/tensorflow/core/kernels/reverse_op_test.cc +++ b/tensorflow/core/kernels/reverse_op_test.cc @@ -115,17 +115,17 @@ class ReverseOpTest : public OpsTestBase { } }; -TEST_F(ReverseOpTest, Reverse_0_uint8) { Reverse_0(); } +TEST_F(ReverseOpTest, Reverse_0_uint8) { Reverse_0(); } -TEST_F(ReverseOpTest, 
Reverse_0_int8) { Reverse_0(); } +TEST_F(ReverseOpTest, Reverse_0_int8) { Reverse_0(); } -TEST_F(ReverseOpTest, Reverse_0_uint16) { Reverse_0(); } +TEST_F(ReverseOpTest, Reverse_0_uint16) { Reverse_0(); } -TEST_F(ReverseOpTest, Reverse_0_int16) { Reverse_0(); } +TEST_F(ReverseOpTest, Reverse_0_int16) { Reverse_0(); } TEST_F(ReverseOpTest, Reverse_0_float) { Reverse_0(); } -TEST_F(ReverseOpTest, Reverse_0_int32) { Reverse_0(); } +TEST_F(ReverseOpTest, Reverse_0_int32) { Reverse_0(); } TEST_F(ReverseOpTest, Reverse_0_int64) { Reverse_0(); } @@ -135,17 +135,17 @@ TEST_F(ReverseOpTest, Reverse_0_complex64) { Reverse_0(); } TEST_F(ReverseOpTest, Reverse_0_complex128) { Reverse_0(); } -TEST_F(ReverseOpTest, Reverse_234_uint8) { Reverse_234(); } +TEST_F(ReverseOpTest, Reverse_234_uint8) { Reverse_234(); } -TEST_F(ReverseOpTest, Reverse_234_int8) { Reverse_234(); } +TEST_F(ReverseOpTest, Reverse_234_int8) { Reverse_234(); } -TEST_F(ReverseOpTest, Reverse_234_uint16) { Reverse_234(); } +TEST_F(ReverseOpTest, Reverse_234_uint16) { Reverse_234(); } -TEST_F(ReverseOpTest, Reverse_234_int16) { Reverse_234(); } +TEST_F(ReverseOpTest, Reverse_234_int16) { Reverse_234(); } TEST_F(ReverseOpTest, Reverse_234_float) { Reverse_234(); } -TEST_F(ReverseOpTest, Reverse_234_int32) { Reverse_234(); } +TEST_F(ReverseOpTest, Reverse_234_int32) { Reverse_234(); } TEST_F(ReverseOpTest, Reverse_234_int64) { Reverse_234(); } @@ -155,17 +155,17 @@ TEST_F(ReverseOpTest, Reverse_234_complex64) { Reverse_234(); } TEST_F(ReverseOpTest, Reverse_234_complex128) { Reverse_234(); } -TEST_F(ReverseOpTest, Reverse_1234_uint8) { Reverse_1234(); } +TEST_F(ReverseOpTest, Reverse_1234_uint8) { Reverse_1234(); } -TEST_F(ReverseOpTest, Reverse_1234_int8) { Reverse_1234(); } +TEST_F(ReverseOpTest, Reverse_1234_int8) { Reverse_1234(); } -TEST_F(ReverseOpTest, Reverse_1234_uint16) { Reverse_1234(); } +TEST_F(ReverseOpTest, Reverse_1234_uint16) { Reverse_1234(); } -TEST_F(ReverseOpTest, Reverse_1234_int16) { Reverse_1234(); } +TEST_F(ReverseOpTest, Reverse_1234_int16) { Reverse_1234(); } TEST_F(ReverseOpTest, Reverse_1234_float) { Reverse_1234(); } -TEST_F(ReverseOpTest, Reverse_1234_int32) { Reverse_1234(); } +TEST_F(ReverseOpTest, Reverse_1234_int32) { Reverse_1234(); } TEST_F(ReverseOpTest, Reverse_1234_int64) { Reverse_1234(); } @@ -190,7 +190,7 @@ static Graph* Reverse(const TensorShape& shape, int reverse_axis) { Tensor data(DataTypeToEnum::value, shape); data.flat().setRandom(); Tensor axes(DT_INT32, TensorShape({1})); - axes.flat()(0) = reverse_axis; + axes.flat()(0) = reverse_axis; test::graph::Reverse(g, test::graph::Constant(g, data), test::graph::Constant(g, axes)); return g; @@ -229,8 +229,8 @@ void BM_ReverseRowsOf1Channel_1T_uint8(::testing::benchmark::State& state) { const int outer_dim = state.range(0); const int middle_dim = state.range(1); - RunReverseRowsBenchmark(state, outer_dim, middle_dim, - 1 /* intra_threads */, 1 /* channels */); + RunReverseRowsBenchmark(state, outer_dim, middle_dim, + 1 /* intra_threads */, 1 /* channels */); } BENCHMARK(BM_ReverseRowsOf1Channel_1T_uint8) @@ -257,8 +257,8 @@ void BM_ReverseRowsOf1Channel_4T_uint8(::testing::benchmark::State& state) { const int outer_dim = state.range(0); const int middle_dim = state.range(1); - RunReverseRowsBenchmark(state, outer_dim, middle_dim, - 4 /* intra_threads */, 1 /* channels */); + RunReverseRowsBenchmark(state, outer_dim, middle_dim, + 4 /* intra_threads */, 1 /* channels */); } BENCHMARK(BM_ReverseRowsOf1Channel_4T_uint8) @@ -286,8 +286,8 @@ void 
BM_ReverseRowsOf3Channels_1T_uint8(::testing::benchmark::State& state) { const int outer_dim = state.range(0); const int middle_dim = state.range(1); - RunReverseRowsBenchmark(state, outer_dim, middle_dim, - 1 /* intra_threads */, 3 /* channels */); + RunReverseRowsBenchmark(state, outer_dim, middle_dim, + 1 /* intra_threads */, 3 /* channels */); } BENCHMARK(BM_ReverseRowsOf3Channels_1T_uint8) @@ -316,8 +316,8 @@ void BM_ReverseRowsOf3Channels_4T_uint8(::testing::benchmark::State& state) { const int outer_dim = state.range(0); const int middle_dim = state.range(1); - RunReverseRowsBenchmark(state, outer_dim, middle_dim, - 4 /* intra_threads */, 3 /* channels */); + RunReverseRowsBenchmark(state, outer_dim, middle_dim, + 4 /* intra_threads */, 3 /* channels */); } BENCHMARK(BM_ReverseRowsOf3Channels_4T_uint8) ->UseRealTime() @@ -344,8 +344,8 @@ void BM_ReverseRowsOf4Channels_1T_uint8(::testing::benchmark::State& state) { const int outer_dim = state.range(0); const int middle_dim = state.range(1); - RunReverseRowsBenchmark(state, outer_dim, middle_dim, - 1 /* intra_threads */, 4 /* channels */); + RunReverseRowsBenchmark(state, outer_dim, middle_dim, + 1 /* intra_threads */, 4 /* channels */); } BENCHMARK(BM_ReverseRowsOf4Channels_1T_uint8) @@ -372,8 +372,8 @@ void BM_ReverseRowsOf4Channels_4T_uint8(::testing::benchmark::State& state) { const int outer_dim = state.range(0); const int middle_dim = state.range(1); - RunReverseRowsBenchmark(state, outer_dim, middle_dim, - 4 /* intra_threads */, 4 /* channels */); + RunReverseRowsBenchmark(state, outer_dim, middle_dim, + 4 /* intra_threads */, 4 /* channels */); } BENCHMARK(BM_ReverseRowsOf4Channels_4T_uint8) diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc index 139520ece5e2a0..7d33356a169ccf 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.cc +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -98,8 +98,8 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) { } template <> -void CheckErrors(OpKernelContext* context, int batch_dim, - int seq_dim) { +void CheckErrors(OpKernelContext* context, int batch_dim, + int seq_dim) { CheckErrorsGPU(context, batch_dim, seq_dim); } @@ -164,8 +164,8 @@ class ReverseSequenceOp : public OpKernel { } private: - int32 batch_dim_; - int32 seq_dim_; + int32_t batch_dim_; + int32_t seq_dim_; ReverseSequenceOp(const ReverseSequenceOp&) = delete; void operator=(const ReverseSequenceOp&) = delete; diff --git a/tensorflow/core/kernels/reverse_sequence_op.h b/tensorflow/core/kernels/reverse_sequence_op.h index f25794f3a2ad39..7db47a4b8bbce3 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.h +++ b/tensorflow/core/kernels/reverse_sequence_op.h @@ -49,8 +49,8 @@ class ReverseGenerator { private: typename TTypes::ConstTensor input_; - int32 batch_dim_; - int32 seq_dim_; + int32_t batch_dim_; + int32_t seq_dim_; typename TTypes::ConstVec seq_lengths_; }; diff --git a/tensorflow/core/kernels/roll_op_gpu.cu.cc b/tensorflow/core/kernels/roll_op_gpu.cu.cc index dca487fc060003..130bdd206b67fd 100644 --- a/tensorflow/core/kernels/roll_op_gpu.cu.cc +++ b/tensorflow/core/kernels/roll_op_gpu.cu.cc @@ -30,15 +30,15 @@ typedef Eigen::GpuDevice GPUDevice; namespace { template -__global__ void RollKernel(const int32 nthreads, const int32 num_dims, +__global__ void RollKernel(const int32_t nthreads, const int32_t num_dims, const T* __restrict__ input, T* __restrict__ output, - const int32* __restrict__ dim_size, - const int32* 
__restrict__ threshold, - const int64* __restrict__ dim_range) { + const int32_t* __restrict__ dim_size, + const int32_t* __restrict__ threshold, + const int64_t* __restrict__ dim_range) { CUDA_1D_KERNEL_LOOP(out_idx, nthreads) { - int64 offset = 0; + int64_t offset = 0; for (int i = 0; i < num_dims; i++) { - const int64 stride = dim_range[i] / dim_size[i]; + const int64_t stride = dim_range[i] / dim_size[i]; const int shift = dim_size[i] - threshold[i]; const int indx = (out_idx / stride) % dim_size[i]; const int shifted_indx = (indx + shift) % dim_size[i]; @@ -53,21 +53,22 @@ namespace functor { template struct Roll { - void operator()(const OpKernelContext* context, const int64 num_elements, - const int num_dims, const gtl::ArraySlice dim_size, + void operator()(const OpKernelContext* context, const int64_t num_elements, + const int num_dims, const absl::Span dim_size, const T* input, T* output, - const gtl::ArraySlice threshold, - const gtl::ArraySlice dim_range, const int64 isd) { + const absl::Span threshold, + const absl::Span dim_range, + const int64_t isd) { if (!num_elements) return; const GPUDevice& d = context->eigen_device(); - auto dim_bytes = sizeof(int32) * dim_size.size(); + auto dim_bytes = sizeof(int32_t) * dim_size.size(); auto dim_buf = d.allocate(dim_bytes); - auto thres_bytes = sizeof(int32) * threshold.size(); + auto thres_bytes = sizeof(int32_t) * threshold.size(); auto thres_buf = d.allocate(thres_bytes); - auto range_bytes = sizeof(int64) * dim_range.size(); + auto range_bytes = sizeof(int64_t) * dim_range.size(); auto range_buf = d.allocate(range_bytes); d.memcpyHostToDevice(dim_buf, dim_size.data(), dim_bytes); @@ -76,12 +77,12 @@ struct Roll { GpuLaunchConfig cfg = GetGpuLaunchConfig(num_elements, d); - TF_CHECK_OK(GpuLaunchKernel(RollKernel, cfg.block_count, - cfg.thread_per_block, 0, d.stream(), - cfg.virtual_thread_count, num_dims, input, - output, reinterpret_cast(dim_buf), - reinterpret_cast(thres_buf), - reinterpret_cast(range_buf))); + TF_CHECK_OK( + GpuLaunchKernel(RollKernel, cfg.block_count, cfg.thread_per_block, 0, + d.stream(), cfg.virtual_thread_count, num_dims, input, + output, reinterpret_cast(dim_buf), + reinterpret_cast(thres_buf), + reinterpret_cast(range_buf))); d.deallocate(dim_buf); d.deallocate(thres_buf); diff --git a/tensorflow/core/kernels/scan_ops.cc b/tensorflow/core/kernels/scan_ops.cc index 7e9054f997172d..f9dac8363f8f37 100644 --- a/tensorflow/core/kernels/scan_ops.cc +++ b/tensorflow/core/kernels/scan_ops.cc @@ -104,7 +104,7 @@ namespace functor { DECLARE(Eigen::internal::ProdReducer, T); TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_ALL_REDUCERS); -DECLARE_FOR_ALL_REDUCERS(int32); +DECLARE_FOR_ALL_REDUCERS(int32_t); DECLARE_FOR_ALL_REDUCERS(int64_t); #undef DECLARE_FOR_ALL_REDUCERS @@ -151,7 +151,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); .HostMemory("axis"), \ ScanOp, int64>) TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); -REGISTER_GPU_KERNELS(int32); +REGISTER_GPU_KERNELS(int32_t); REGISTER_GPU_KERNELS(int64_t); #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -190,7 +190,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); .HostMemory("axis"), \ ScanOp, int64>) TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); -REGISTER_GPU_KERNELS(int32); +REGISTER_GPU_KERNELS(int32_t); REGISTER_GPU_KERNELS(int64_t); #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h index 
61868b7853e400..e4f43d51b46075 100644 --- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h +++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h @@ -97,7 +97,7 @@ __global__ void ScatterOpCustomKernel(T* __restrict__ params, // Ignore indices that are out of range. continue; } - int64 params_i = param_first_index * update_block + (i % update_block); + int64_t params_i = param_first_index * update_block + (i % update_block); body(¶ms[params_i], ldg(updates + updates_i)); } } diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index 7d61e1aa2f257e..d5e3b2ad9eb0a9 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -1040,10 +1040,10 @@ absl::Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, // and the GPU implementation is not. Tensor inputs to this function must be on // the GPU. template -Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, - const Tensor& updates, const TensorShape& shape, - Tensor* out, bool allocate, - BadIndicesPolicy bad_indices_policy) { +absl::Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, + const Tensor& updates, const TensorShape& shape, + Tensor* out, bool allocate, + BadIndicesPolicy bad_indices_policy) { AllocatorAttributes alloc_attr; alloc_attr.set_on_host(true); alloc_attr.set_gpu_compatible(true); @@ -1053,7 +1053,7 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, Tensor host_indices; TF_RETURN_IF_ERROR(c->allocate_temp(indices.dtype(), indices.shape(), &host_indices, alloc_attr)); - se::DeviceMemoryBase indices_ptr( + stream_executor::DeviceAddressBase indices_ptr( const_cast(indices).flat().data(), indices.flat().size() * sizeof(Index)); TF_RETURN_IF_ERROR(stream->Memcpy(host_indices.flat().data(), @@ -1063,7 +1063,7 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, Tensor host_updates; TF_RETURN_IF_ERROR(c->allocate_temp(updates.dtype(), updates.shape(), &host_updates, alloc_attr)); - se::DeviceMemoryBase updates_ptr( + stream_executor::DeviceAddressBase updates_ptr( const_cast(updates).flat().data(), updates.flat().size() * sizeof(T)); TF_RETURN_IF_ERROR(stream->Memcpy(host_updates.flat().data(), updates_ptr, @@ -1078,8 +1078,8 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, fill(c->eigen_device(), host_out.flat()); } else { CHECK_NOTNULL(out); // Crash OK - se::DeviceMemoryBase out_ptr(out->flat().data(), - out->flat().size() * sizeof(T)); + stream_executor::DeviceAddressBase out_ptr( + out->flat().data(), out->flat().size() * sizeof(T)); TF_RETURN_IF_ERROR(stream->Memcpy(host_out.flat().data(), out_ptr, host_out.NumElements() * sizeof(T))); } @@ -1090,13 +1090,13 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices, bad_indices_policy)); // Copy 'host_out' to device. - se::DeviceMemoryBase out_ptr(out->flat().data(), - out->flat().size() * sizeof(T)); + stream_executor::DeviceAddressBase out_ptr(out->flat().data(), + out->flat().size() * sizeof(T)); TF_RETURN_IF_ERROR(stream->Memcpy(&out_ptr, host_out.flat().data(), host_out.NumElements() * sizeof(T))); // Block host, since 'host_out' cannot be destructed until the copy is done. 
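The DoScatterNdOnCpu specialization around here stages GPU-resident indices and updates into host memory, runs the CPU scatter there, and copies the result back, which is why the stream must be drained before host_out goes out of scope. The staging allocation follows this pattern (sketch drawn from the hunk, reusing its names):

```cpp
// Ask for host-resident, GPU-DMA-compatible temporary memory so the device
// tensors can be copied off the GPU and scattered by the CPU implementation.
AllocatorAttributes alloc_attr;
alloc_attr.set_on_host(true);
alloc_attr.set_gpu_compatible(true);
Tensor host_indices;
TF_RETURN_IF_ERROR(c->allocate_temp(indices.dtype(), indices.shape(),
                                    &host_indices, alloc_attr));
```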
TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); - return OkStatus(); + return absl::OkStatus(); } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index fd1d4747c40982..ae2402b2a228e1 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -98,7 +98,7 @@ template __global__ void ScatterNdOpKernel( const Index* indices, const T* updates, T* out, const Eigen::array output_shape_prefix, - const Eigen::array batch_strides, const int64 num_indices, + const Eigen::array batch_strides, const int64_t num_indices, const Index slice_size) { auto update = LeftUpdate(); @@ -141,7 +141,7 @@ struct ScatterNdFunctor { const Eigen::DenseIndex batch_size = Tindices.dimension(0); // Index batch_strides[IXDIM]; - Eigen::array batch_strides; + Eigen::array batch_strides; if (IXDIM > 0) { batch_strides[IXDIM - 1] = 1; } diff --git a/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc index 67602db6164561..10448882a9296d 100644 --- a/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc +++ b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc @@ -60,15 +60,16 @@ __global__ void LowerBoundKernel(const T* __restrict__ sorted_inputs, namespace functor { template struct UpperBoundFunctor { - static Status Compute(OpKernelContext* context, - const typename TTypes::ConstTensor& sorted_inputs, - const typename TTypes::ConstTensor& values, - int batch_size, int num_inputs, int num_values, - typename TTypes::Tensor* output) { + static absl::Status Compute( + OpKernelContext* context, + const typename TTypes::ConstTensor& sorted_inputs, + const typename TTypes::ConstTensor& values, int batch_size, + int num_inputs, int num_values, + typename TTypes::Tensor* output) { const GPUDevice& device = context->eigen_device(); if (values.size() == 0) { // GetGpuLaunchConfig requires work_element_count > 0 - return OkStatus(); + return absl::OkStatus(); } GpuLaunchConfig config = GetGpuLaunchConfig(values.size(), device); @@ -77,21 +78,22 @@ struct UpperBoundFunctor { config.thread_per_block, 0, device.stream(), sorted_inputs.data(), batch_size, num_inputs, num_values, values.data(), output->data())); - return OkStatus(); + return absl::OkStatus(); } }; template struct LowerBoundFunctor { - static Status Compute(OpKernelContext* context, - const typename TTypes::ConstTensor& sorted_inputs, - const typename TTypes::ConstTensor& values, - int batch_size, int num_inputs, int num_values, - typename TTypes::Tensor* output) { + static absl::Status Compute( + OpKernelContext* context, + const typename TTypes::ConstTensor& sorted_inputs, + const typename TTypes::ConstTensor& values, int batch_size, + int num_inputs, int num_values, + typename TTypes::Tensor* output) { const GPUDevice& device = context->eigen_device(); if (values.size() == 0) { // GetGpuLaunchConfig requires work_element_count > 0 - return OkStatus(); + return absl::OkStatus(); } GpuLaunchConfig config = GetGpuLaunchConfig(values.size(), device); @@ -100,7 +102,7 @@ struct LowerBoundFunctor { config.thread_per_block, 0, device.stream(), sorted_inputs.data(), batch_size, num_inputs, num_values, values.data(), output->data())); - return OkStatus(); + return absl::OkStatus(); } }; } // namespace functor diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h index f0ba0ce2c27572..dc63e6c5602956 
100644 --- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h +++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h @@ -165,7 +165,7 @@ __global__ void SegmentMeanNormalizeKernel( } template -Status LaunchSegmentMeanNormalizeKernel( +absl::Status LaunchSegmentMeanNormalizeKernel( const GPUDevice& d, SegmentId nsegments, Index ninner, const Index* __restrict__ segment_offsets, // [nsegments + 1] T* __restrict__ output) { // [nsegments, ninner] @@ -195,7 +195,7 @@ __global__ void SegmentSetEmptyKernel( } template -Status LaunchSegmentSetEmptyKernel( +absl::Status LaunchSegmentSetEmptyKernel( const GPUDevice& d, SegmentId nsegments, Index ninner, const Index* __restrict__ segment_offsets, // [nsegments + 1] const T empty_value, @@ -263,7 +263,7 @@ __global__ void SegmentOffsetsKernel( // value at segment_offsets[nsegments] is set to the end index of the last valid // ID (e.g., nsegments if all IDs are valid). template -Status LaunchSegmentOffsetsKernel( +absl::Status LaunchSegmentOffsetsKernel( const GPUDevice& d, Toffsets size, Tsegmentids nsegments, const Tsegmentids* segment_ids, // [size] Toffsets* segment_offsets) { // [nsegments + 1] @@ -397,7 +397,7 @@ __global__ void SegmentReduceVectorKernel( template -Status LaunchSegmentReduceVectorKernel( +absl::Status LaunchSegmentReduceVectorKernel( const GPUDevice& d, Toffsets nouter, Toffsets ninner_vec, Tsegmentids nsegments, ReduceOp reduce_op, Tinit initial_value, Tinit empty_segment_value, bool is_mean, bool is_sqrtn, @@ -467,7 +467,7 @@ __global__ void SegmentReduceEpilogueKernel( // be a higher-precision type than the output type Tvec (e.g., float vs. half). template -Status LaunchSegmentReduceEpilogueKernel( +absl::Status LaunchSegmentReduceEpilogueKernel( const GPUDevice& d, Tsegmentids nsegments, Tinit empty_segment_value, bool is_mean, bool is_sqrtn, const Treducevec* output_raw, // [nsegments] @@ -542,7 +542,7 @@ MakeLookupAndScaleAndCastInputsIterator(const Tvec* input_vec, template -Status SegmentReduceGPUImplNoInnerDim( +absl::Status SegmentReduceGPUImplNoInnerDim( OpKernelContext* ctx, Toffsets nouter, Tsegmentids nsegments, ReduceOp reduce_op, Tinit initial_value, Tinit empty_segment_value, bool is_mean, bool is_sqrtn, @@ -568,7 +568,7 @@ Status SegmentReduceGPUImplNoInnerDim( TensorShape({static_cast(nsegments * sizeof(Treducevec))}), &output_raw)); output_raw_ptr = - reinterpret_cast(output_raw.flat().data()); + reinterpret_cast(output_raw.flat().data()); } auto input_iter = MakeLookupAndScaleAndCastInputsIterator( @@ -586,13 +586,13 @@ Status SegmentReduceGPUImplNoInnerDim( device, nsegments, empty_segment_value, is_mean, is_sqrtn, output_raw_ptr, segment_offsets, output_vec)); } - return OkStatus(); + return absl::OkStatus(); } template -Status SegmentReduceGPUImpl( +absl::Status SegmentReduceGPUImpl( OpKernelContext* ctx, Toffsets nouter, Toffsets ninner_vec, Tsegmentids nsegments, ReduceOp reduce_op, Tinit initial_value, Tinit empty_segment_value, bool is_mean, bool is_sqrtn, @@ -648,12 +648,13 @@ struct SegmentReduceGPUVectorized { struct Impl { template - Status operator()(OpKernelContext* ctx, Toffsets nouter, Toffsets ninner, - Tsegmentids nsegments, ReduceOp reduce_op, - T initial_value, T empty_segment_value, bool is_mean, - bool is_sqrtn, const T* input, - const Tsegmentids* segment_ids, const Tindices* indices, - const Tweights* weights, T* output) { + absl::Status operator()(OpKernelContext* ctx, Toffsets nouter, + Toffsets ninner, Tsegmentids nsegments, + ReduceOp reduce_op, T initial_value, + T 
empty_segment_value, bool is_mean, bool is_sqrtn, + const T* input, const Tsegmentids* segment_ids, + const Tindices* indices, const Tweights* weights, + T* output) { DCHECK_EQ(ninner % vec_size, 0); DCHECK_EQ(reinterpret_cast(input) % vec_size, 0); DCHECK_EQ(reinterpret_cast(output) % vec_size, 0); @@ -682,16 +683,16 @@ struct SegmentReduceGPUVectorized { // Note: Treduce is to allow reducing in higher precision than T. template -Status SegmentReduceGPU(OpKernelContext* ctx, Toffsets nouter, Toffsets ninner, - Tsegmentids nsegments, ReduceOp reduce_op, - T initial_value, T empty_segment_value, bool is_mean, - bool is_sqrtn, - const T* input, // [nouter or any, ninner] - const Tsegmentids* segment_ids, // [nouter] - const Tindices* indices, // [nouter] (optional) - const Tweights* weights, // [nouter or any] (optional) - T* output) { // [nsegments, ninner] - if (ninner == 0 || nsegments == 0) return OkStatus(); +absl::Status SegmentReduceGPU( + OpKernelContext* ctx, Toffsets nouter, Toffsets ninner, + Tsegmentids nsegments, ReduceOp reduce_op, T initial_value, + T empty_segment_value, bool is_mean, bool is_sqrtn, + const T* input, // [nouter or any, ninner] + const Tsegmentids* segment_ids, // [nouter] + const Tindices* indices, // [nouter] (optional) + const Tweights* weights, // [nouter or any] (optional) + T* output) { // [nsegments, ninner] + if (ninner == 0 || nsegments == 0) return absl::OkStatus(); return DispatchToVectorized< T, SegmentReduceGPUVectorized::template Impl>( MinAlignmentOf(input, output, ninner), ctx, nouter, ninner, nsegments, @@ -716,7 +717,7 @@ __global__ void SegmentWeightsKernel( } template -Status LaunchSegmentWeightsKernel( +absl::Status LaunchSegmentWeightsKernel( const GPUDevice& d, SegmentId nsegments, SparseSegmentReductionOperation operation, const Index* segment_offsets, // [nsegments + 1] @@ -945,7 +946,7 @@ struct UnsortedSegmentFunctor { }; template -Status SparseSegmentReductionFunctor::operator()( +absl::Status SparseSegmentReductionFunctor::operator()( OpKernelContext* context, bool is_mean, bool is_sqrtn, T default_value, typename TTypes::ConstTensor input, typename TTypes::ConstVec indices, @@ -1087,7 +1088,7 @@ __global__ void ScatterUniqueIndicesKernel( template -Status LaunchScatterUniqueIndicesKernel( +absl::Status LaunchScatterUniqueIndicesKernel( const GPUDevice& d, Toffsets nouter, EdgeIndicatorIter sorted_indices_edge_indicator, // [nouter] const TindicesCompact* __restrict__ sorted_indices, // [nouter] @@ -1122,7 +1123,7 @@ struct SparseSegmentGradV2Functor { const int64_t nouter64 = indices_vec.dimension(0); // Note: nouter and ninner are not expected to be huge, so we use int32 to // save memory bandwidth. - using Toffsets = int32; + using Toffsets = int32_t; OP_REQUIRES_ASYNC(context, nouter64 <= std::numeric_limits::max(), absl::InvalidArgumentError( absl::StrCat("Indices vector of length ", nouter64, @@ -1140,7 +1141,7 @@ struct SparseSegmentGradV2Functor { // worth it because the vector is used multiple times). // Note that we can currently assume int32 is safe because the op's dense // output_dim0 input is always int32. 
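SparseSegmentGradV2Functor narrows its offset and segment-id types to 32 bits whenever the counts are known to fit, purely to cut GPU memory traffic. A standalone sketch of the dispatch condition the hunk uses; the function name is hypothetical:

```cpp
#include <cstdint>
#include <limits>

// Cast 64-bit segment ids down to int32 only when every value must fit.
template <typename Tsegmentids>
bool ShouldCastSegmentIdsToInt32(int64_t nsegments) {
  return sizeof(Tsegmentids) > sizeof(int32_t) &&
         nsegments <= std::numeric_limits<int32_t>::max();
}
```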
- using TindicesCompact = int32; + using TindicesCompact = int32_t; Tensor tmp_indices_internal; const TindicesCompact* indices_internal_ptr; if constexpr (std::is_same::value) { @@ -1163,9 +1164,9 @@ struct SparseSegmentGradV2Functor { context, operation, nouter, ninner, nsegments, input_flat.data(), tmp_indices_internal, indices_internal_ptr, segment_vec, dense_output_shape, done); - } else if (sizeof(Tsegmentids) > sizeof(int32) && - nsegments <= std::numeric_limits::max()) { - CastSegmentIdsThenImpl( + } else if (sizeof(Tsegmentids) > sizeof(int32_t) && + nsegments <= std::numeric_limits::max()) { + CastSegmentIdsThenImpl( context, operation, nouter, ninner, nsegments, input_flat.data(), tmp_indices_internal, indices_internal_ptr, segment_vec, dense_output_shape, done); @@ -1295,12 +1296,13 @@ struct SparseSegmentGradV2Functor { ScratchSpace last_idx_host(context, 1, /*on_host=*/true); OP_REQUIRES_OK_ASYNC( context, - stream->Memcpy(last_idx_host.mutable_data(), - se::DeviceMemoryBase(const_cast( - sorted_indices_unique_ids_ptr) + - (nouter - 1), - sizeof(*last_idx_host.data())), - sizeof(*last_idx_host.data())), + stream->Memcpy( + last_idx_host.mutable_data(), + stream_executor::DeviceAddressBase( + const_cast(sorted_indices_unique_ids_ptr) + + (nouter - 1), + sizeof(*last_idx_host.data())), + sizeof(*last_idx_host.data())), done); auto async_finish_computation = diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc index bdeb782dc47e49..4bc9c22b33bb00 100644 --- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc @@ -31,11 +31,11 @@ typedef Eigen::GpuDevice GPUDevice; // GPU kernel. template struct S2BParameters { - int32 space_tensor_batch; - int32 batch_tensor_shape[NUM_BLOCK_DIMS + 2]; - int32 space_tensor_spatial_shape[NUM_BLOCK_DIMS]; - int32 pad_start[NUM_BLOCK_DIMS]; - int32 block_shape[NUM_BLOCK_DIMS]; + int32_t space_tensor_batch; + int32_t batch_tensor_shape[NUM_BLOCK_DIMS + 2]; + int32_t space_tensor_spatial_shape[NUM_BLOCK_DIMS]; + int32_t pad_start[NUM_BLOCK_DIMS]; + int32_t block_shape[NUM_BLOCK_DIMS]; }; // GPU kernel for space-to-batch (if B2S = false) and batch-to-space conversion @@ -44,13 +44,13 @@ struct S2BParameters { // To simplify template implementation given lack of constexpr if, both the // input and output pointers are non-const. 
template -__global__ void S2B(const int32 nthreads, T* __restrict__ space_tensor_ptr, +__global__ void S2B(const int32_t nthreads, T* __restrict__ space_tensor_ptr, S2BParameters args, T* __restrict__ batch_tensor_ptr) { GPU_1D_KERNEL_LOOP(batch_tensor_idx, nthreads) { - int32 remaining_batch_tensor_idx = batch_tensor_idx; + int32_t remaining_batch_tensor_idx = batch_tensor_idx; - int32 batch_tensor_pos[NUM_BLOCK_DIMS + 2]; + int32_t batch_tensor_pos[NUM_BLOCK_DIMS + 2]; for (int dim = NUM_BLOCK_DIMS + 1; dim >= 1; --dim) { batch_tensor_pos[dim] = @@ -59,17 +59,17 @@ __global__ void S2B(const int32 nthreads, T* __restrict__ space_tensor_ptr, } batch_tensor_pos[0] = remaining_batch_tensor_idx; - int32 remaining_block_idx = batch_tensor_pos[0] / args.space_tensor_batch; - int32 space_tensor_idx = batch_tensor_pos[NUM_BLOCK_DIMS + 1]; - int32 space_tensor_stride = args.batch_tensor_shape[NUM_BLOCK_DIMS + 1]; - const int32 space_tensor_batch_pos = + int32_t remaining_block_idx = batch_tensor_pos[0] / args.space_tensor_batch; + int32_t space_tensor_idx = batch_tensor_pos[NUM_BLOCK_DIMS + 1]; + int32_t space_tensor_stride = args.batch_tensor_shape[NUM_BLOCK_DIMS + 1]; + const int32_t space_tensor_batch_pos = batch_tensor_pos[0] % args.space_tensor_batch; for (int block_dim = NUM_BLOCK_DIMS - 1; block_dim >= 0; --block_dim) { - int32 offset = remaining_block_idx; + int32_t offset = remaining_block_idx; if (block_dim > 0) { offset %= args.block_shape[block_dim]; } - int32 space_tensor_pos = + int32_t space_tensor_pos = batch_tensor_pos[block_dim + 1] * args.block_shape[block_dim] + offset - args.pad_start[block_dim]; if (space_tensor_pos < 0 || @@ -102,45 +102,45 @@ template struct SpaceToBatchFunctor { using SpaceT = typename std::conditional::type; using BatchT = typename std::conditional::type; - Status operator()( + absl::Status operator()( const GPUDevice& d, typename TTypes::Tensor space_tensor, - const int64 block_shape[NUM_BLOCK_DIMS], - const int64 paddings[NUM_BLOCK_DIMS * 2], + const int64_t block_shape[NUM_BLOCK_DIMS], + const int64_t paddings[NUM_BLOCK_DIMS * 2], typename TTypes::Tensor batch_tensor) { // Kernel execution fails if number of elements is zero. 
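// Illustrative sketch (not part of this patch): the per-dimension index
// mapping the S2B kernel above performs for one spatial dimension. A position
// in the batch tensor maps to block_size positions in the space tensor,
// shifted by the block offset and the leading padding; positions that land in
// the padding are skipped. Plain C++; the function name SpacePosition is an
// assumption for illustration.
#include <cstdint>
#include <optional>

// Maps (batch-tensor position, block offset) back to the position in the
// unpadded space tensor, or nullopt if it falls inside the padding.
std::optional<std::int32_t> SpacePosition(std::int32_t batch_pos,
                                          std::int32_t block_offset,
                                          std::int32_t block_size,
                                          std::int32_t pad_start,
                                          std::int32_t space_extent) {
  const std::int32_t space_pos =
      batch_pos * block_size + block_offset - pad_start;
  if (space_pos < 0 || space_pos >= space_extent) return std::nullopt;
  return space_pos;
}
// (end of sketch)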
if (batch_tensor.size() == 0) { - return OkStatus(); + return absl::OkStatus(); } S2BParameters args; args.space_tensor_batch = space_tensor.dimension(0); for (int block_dim = 0; block_dim < NUM_BLOCK_DIMS; ++block_dim) { - if (block_shape[block_dim] > std::numeric_limits::max()) { + if (block_shape[block_dim] > std::numeric_limits::max()) { return errors::InvalidArgument("block_shape value exceeds 2^32-1"); } args.block_shape[block_dim] = block_shape[block_dim]; if (space_tensor.dimension(block_dim + 1) > - std::numeric_limits::max()) { + std::numeric_limits::max()) { return errors::InvalidArgument("space_tensor dimension exceeds 2^32-1"); } args.space_tensor_spatial_shape[block_dim] = space_tensor.dimension(block_dim + 1); - if (paddings[block_dim * 2] > std::numeric_limits::max()) { + if (paddings[block_dim * 2] > std::numeric_limits::max()) { return errors::InvalidArgument("paddings/crops value exceeds 2^32-1"); } args.pad_start[block_dim] = paddings[block_dim * 2]; } - int64 total_count = 1; + int64_t total_count = 1; for (int dim = 0; dim < NUM_BLOCK_DIMS + 2; ++dim) { args.batch_tensor_shape[dim] = batch_tensor.dimension(dim); total_count *= args.batch_tensor_shape[dim]; } - if (total_count > std::numeric_limits::max()) { + if (total_count > std::numeric_limits::max()) { return errors::InvalidArgument( "number of batch_tensor elements exceeds 2^32-1"); } GpuLaunchConfig config = - GetGpuLaunchConfig(static_cast(total_count), d); + GetGpuLaunchConfig(static_cast(total_count), d); return GpuLaunchKernel(S2B, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc index 8bb9474ca9b524..97acca5442890d 100644 --- a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc +++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc @@ -29,7 +29,7 @@ typedef Eigen::GpuDevice GPUDevice; // Space2Depth kernel for FORMAT_NHWC. // See 'spacetodepth_op.h' for a more detailed description. template -__global__ void S2D_NHWC(const int32 nthreads, +__global__ void S2D_NHWC(const int32_t nthreads, const dtype* __restrict__ input_ptr, const int block_size, const int batch_size, const int input_height, const int input_width, @@ -61,7 +61,7 @@ __global__ void S2D_NHWC(const int32 nthreads, // Space2Depth kernel for FORMAT_NCHW. // See 'spacetodepth_op.h' for a more detailed description. template -__global__ void S2D_NCHW(const int32 nthreads, +__global__ void S2D_NCHW(const int32_t nthreads, const dtype* __restrict__ input_ptr, const int block_size, const int output_width, const int input_depth_by_output_height, @@ -99,7 +99,7 @@ __global__ void S2D_NCHW(const int32 nthreads, // Space2Depth kernel for FORMAT_NCHW using a loop over block area. // See 'spacetodepth_op.h' for functional specification. 
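// Illustrative sketch (not part of this patch): the overflow guard used above
// before 64-bit shapes and element counts are narrowed into the int32_t
// fields of S2BParameters and into the GPU launch configuration. FitsInInt32
// is a hypothetical helper name used only for illustration.
#include <cstdint>
#include <limits>

bool FitsInInt32(std::int64_t value) {
  return value >= 0 && value <= std::numeric_limits<std::int32_t>::max();
}
// Usage pattern mirrored by the functor above: guard, then static_cast, e.g.
//   if (!FitsInInt32(block_shape[dim])) return <invalid-argument error>;
//   args.block_shape[dim] = static_cast<std::int32_t>(block_shape[dim]);
// (end of sketch)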
template -__global__ void S2D_NCHW_LOOP(const int32 nthreads, +__global__ void S2D_NCHW_LOOP(const int32_t nthreads, const dtype* __restrict__ input, const int output_width, const int input_width, const int input_depth_by_output_area, diff --git a/tensorflow/core/kernels/sparse/add_op.cc b/tensorflow/core/kernels/sparse/add_op.cc index c454241c1574c2..ef440aa870dfe3 100644 --- a/tensorflow/core/kernels/sparse/add_op.cc +++ b/tensorflow/core/kernels/sparse/add_op.cc @@ -93,19 +93,19 @@ class CSRSparseMatrixAddFunctor { Tensor c_batch_ptr_t(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1})); - auto c_batch_ptr = c_batch_ptr_t.vec(); + auto c_batch_ptr = c_batch_ptr_t.vec(); c_batch_ptr(0) = 0; Tensor c_row_ptr_t; TF_RETURN_IF_ERROR(ctx_->allocate_temp( DT_INT32, TensorShape({batch_size * (rows + 1)}), &c_row_ptr_t)); - auto c_row_ptr = c_row_ptr_t.vec(); + auto c_row_ptr = c_row_ptr_t.vec(); // Set the output row pointers to zero, in case we hit any empty // combinations of rows in a and b. - functor::SetZeroFunctor set_zero; + functor::SetZeroFunctor set_zero; const Device& d = ctx_->eigen_device(); - set_zero(d, c_row_ptr_t.flat()); + set_zero(d, c_row_ptr_t.flat()); size_t maxWorkspaceSize = 0; for (int i = 0; i < batch_size; ++i) { @@ -125,7 +125,7 @@ class CSRSparseMatrixAddFunctor { Tensor temp; TF_RETURN_IF_ERROR(ctx_->allocate_temp( DT_INT8, TensorShape({static_cast(maxWorkspaceSize)}), &temp)); - void* workspace = temp.flat().data(); + void* workspace = temp.flat().data(); for (int i = 0; i < batch_size; ++i) { // Calculate output sizes for all minibatch entries. @@ -138,8 +138,8 @@ class CSRSparseMatrixAddFunctor { a.values_vec(i), a_dense_shape}; ConstCSRComponent b_comp{b.row_pointers_vec(i), b.col_indices_vec(i), b.values_vec(i), b_dense_shape}; - TTypes::UnalignedVec c_row_ptr_i(&c_row_ptr(i * (rows + 1)), - rows + 1); + TTypes::UnalignedVec c_row_ptr_i(&c_row_ptr(i * (rows + 1)), + rows + 1); int c_nnz_i; TF_RETURN_IF_ERROR(csr_geam.GetOutputStructure( a_comp, b_comp, c_row_ptr_i, &c_nnz_i, workspace)); @@ -281,17 +281,18 @@ struct CSRSparseMatrixAdd beta_(beta), initialized_(false) {} - Status Initialize() { + absl::Status Initialize() { TF_RETURN_IF_ERROR(cuda_sparse_.Initialize()); TF_RETURN_IF_ERROR(descrA_.Initialize()); TF_RETURN_IF_ERROR(descrB_.Initialize()); TF_RETURN_IF_ERROR(descrC_.Initialize()); initialized_ = true; - return OkStatus(); + return absl::OkStatus(); } - Status GetWorkspaceSize(const ConstCSRComponent& a, - const ConstCSRComponent& b, size_t* bufferSize) { + absl::Status GetWorkspaceSize(const ConstCSRComponent& a, + const ConstCSRComponent& b, + size_t* bufferSize) { DCHECK(initialized_); const int m = a.row_ptr.size() - 1; @@ -313,13 +314,13 @@ struct CSRSparseMatrixAdd b.row_ptr.data(), b.col_ind.data(), descrC_.descr(), null_T, null_int, null_int, bufferSize)); - return OkStatus(); + return absl::OkStatus(); } - Status GetOutputStructure(const ConstCSRComponent& a, - const ConstCSRComponent& b, - TTypes::UnalignedVec c_row_ptr, - int* output_nnz, void* workspace) { + absl::Status GetOutputStructure(const ConstCSRComponent& a, + const ConstCSRComponent& b, + TTypes::UnalignedVec c_row_ptr, + int* output_nnz, void* workspace) { DCHECK(initialized_); const int m = a.row_ptr.size() - 1; @@ -343,11 +344,12 @@ struct CSRSparseMatrixAdd return errors::Internal( "CSRAdd: CsrgeamNnz returned nnzTotalDevHostPtr < 0: ", *output_nnz); } - return OkStatus(); + return absl::OkStatus(); } - Status Compute(const ConstCSRComponent& a, const 
ConstCSRComponent& b, - CSRComponent* c, void* workspace) { + absl::Status Compute(const ConstCSRComponent& a, + const ConstCSRComponent& b, CSRComponent* c, + void* workspace) { DCHECK(initialized_); const int m = a.row_ptr.size() - 1; @@ -368,7 +370,7 @@ struct CSRSparseMatrixAdd b.row_ptr.data(), b.col_ind.data(), descrC_.descr(), c->values.data(), c->row_ptr.data(), c->col_ind.data(), workspace)); - return OkStatus(); + return absl::OkStatus(); } private: diff --git a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc index 2991f7bad9af89..311469571aaf9f 100644 --- a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc +++ b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc @@ -73,9 +73,9 @@ class CSRSparseMatrixToDenseCPUOp : public OpKernel { const int64_t num_rows = dense_shape((rank == 2) ? 0 : 1); const int64_t num_cols = dense_shape((rank == 2) ? 1 : 2); - auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec(); - auto row_ptr = csr_sparse_matrix->row_pointers().vec(); - auto col_ind = csr_sparse_matrix->col_indices().vec(); + auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec(); + auto row_ptr = csr_sparse_matrix->row_pointers().vec(); + auto col_ind = csr_sparse_matrix->col_indices().vec(); auto values = csr_sparse_matrix->values().vec(); TensorShape dense_tensor_shape; @@ -159,14 +159,14 @@ class CSRSparseMatrixToDenseGPUOp : public OpKernel { functor::CSRSparseMatrixToCOOSparseMatrix csr_to_coo; auto indices = indices_t.matrix(); - auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec(); - auto coo_col_ind = csr_sparse_matrix->col_indices().vec(); - auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec(); + auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec(); + auto coo_col_ind = csr_sparse_matrix->col_indices().vec(); + auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec(); Tensor coo_row_ind_t; OP_REQUIRES_OK(c, c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_row_ind_t)); - auto coo_row_ind = coo_row_ind_t.vec(); + auto coo_row_ind = coo_row_ind_t.vec(); // TODO(ebrevdo): just write a custom kernel that converts from // csr to dense. @@ -176,9 +176,9 @@ class CSRSparseMatrixToDenseGPUOp : public OpKernel { // No copying required. Avoid failure case below. 
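// Illustrative sketch (not part of this patch): how the batched CSR storage
// manipulated above is sliced per batch. Row pointers for batch i occupy
// rows + 1 entries starting at i * (rows + 1); column indices and values for
// batch i start at batch_ptr[i] and span batch_ptr[i + 1] - batch_ptr[i]
// entries. The BatchView struct and SliceBatch function are stand-ins for
// illustration only (float values assumed).
#include <cstdint>
#include <vector>

struct BatchView {
  const std::int32_t* row_ptr;  // rows + 1 entries
  const std::int32_t* col_ind;  // nnz entries
  const float* values;          // nnz entries
  std::int32_t nnz;
};

BatchView SliceBatch(const std::vector<std::int32_t>& row_ptr,
                     const std::vector<std::int32_t>& col_ind,
                     const std::vector<float>& values,
                     const std::vector<std::int32_t>& batch_ptr,
                     std::int64_t rows, int batch) {
  const std::int32_t begin = batch_ptr[batch];
  const std::int32_t nnz = batch_ptr[batch + 1] - begin;
  return {row_ptr.data() + batch * (rows + 1), col_ind.data() + begin,
          values.data() + begin, nnz};
}
// (end of sketch)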
continue; } - const TTypes::UnalignedConstVec csr_row_ptr_i( + const TTypes::UnalignedConstVec csr_row_ptr_i( &csr_row_ptr((rows + 1) * i), rows + 1); - const TTypes::UnalignedVec coo_row_ind_i( + const TTypes::UnalignedVec coo_row_ind_i( &coo_row_ind(csr_sparse_matrix->batch_offset(i)), nnz_i); OP_REQUIRES_OK(c, csr_to_coo(c, csr_row_ptr_i, coo_row_ind_i)); } @@ -237,20 +237,20 @@ REGISTER_GPU(complex128) namespace functor { template <> struct COOSparseMatrixToSparseTensor { - Status operator()(OpKernelContext* ctx, - TTypes::ConstVec host_dense_shape, - TTypes::ConstVec host_batch_ptrs, - TTypes::Vec coo_row_ind, - TTypes::ConstVec coo_col_ind, - TTypes::Matrix indices); + absl::Status operator()(OpKernelContext* ctx, + TTypes::ConstVec host_dense_shape, + TTypes::ConstVec host_batch_ptrs, + TTypes::Vec coo_row_ind, + TTypes::ConstVec coo_col_ind, + TTypes::Matrix indices); }; extern template struct COOSparseMatrixToSparseTensor; template <> struct CSRSparseMatrixToCOOSparseMatrix { - Status operator()(OpKernelContext* c, - TTypes::UnalignedVec csr_row_ptr, - TTypes::UnalignedVec coo_row_ind); + absl::Status operator()(OpKernelContext* c, + TTypes::UnalignedVec csr_row_ptr, + TTypes::UnalignedVec coo_row_ind); }; extern template struct CSRSparseMatrixToCOOSparseMatrix; diff --git a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc index 403af12bb8fb52..07448230f398fb 100644 --- a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc +++ b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc @@ -91,9 +91,9 @@ class CSRSparseMatrixToSparseTensorCPUOp : public OpKernel { c, c->allocate_output(0, TensorShape({total_nnz, rank}), &indices)); auto indices_flat = indices->template flat(); - auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec(); - auto csr_col_ind = csr_sparse_matrix->col_indices().vec(); - auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec(); + auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec(); + auto csr_col_ind = csr_sparse_matrix->col_indices().vec(); + auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec(); // Process the individual batches in parallel using a threadpool. auto shard = [&](int64_t batch_begin, int64_t batch_end) { @@ -165,14 +165,14 @@ class CSRSparseMatrixToSparseTensorGPUOp : public OpKernel { functor::CSRSparseMatrixToCOOSparseMatrix csr_to_coo; auto indices = indices_t->matrix(); - auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec(); - auto coo_col_ind = csr_sparse_matrix->col_indices().vec(); - auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec(); + auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec(); + auto coo_col_ind = csr_sparse_matrix->col_indices().vec(); + auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec(); Tensor coo_row_ind_t; OP_REQUIRES_OK(c, c->allocate_temp(DT_INT32, TensorShape({total_nnz}), &coo_row_ind_t)); - auto coo_row_ind = coo_row_ind_t.vec(); + auto coo_row_ind = coo_row_ind_t.vec(); // TODO(ebrevdo): Convert to one or two single kernel calls, // where the kernels are batch-friendly. @@ -182,9 +182,9 @@ class CSRSparseMatrixToSparseTensorGPUOp : public OpKernel { // No copying required. Avoid failure case below. 
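// Illustrative sketch (not part of this patch): the shape of the
// Status -> absl::Status migration applied throughout these functor
// declarations. ExampleFunctor is a stand-in, not a TensorFlow type; it only
// shows the spelled-out absl::Status / absl::OkStatus() forms that replace
// the previously unqualified aliases.
#include "absl/status/status.h"

struct ExampleFunctor {
  absl::Status operator()(int nnz) const {
    if (nnz < 0) {
      return absl::InvalidArgumentError("nnz must be non-negative");
    }
    return absl::OkStatus();  // previously written as OkStatus()
  }
};
// (end of sketch)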
continue; } - const TTypes::UnalignedConstVec csr_row_ptr_i( + const TTypes::UnalignedConstVec csr_row_ptr_i( &csr_row_ptr((rows + 1) * i), rows + 1); - const TTypes::UnalignedVec coo_row_ind_i( + const TTypes::UnalignedVec coo_row_ind_i( &coo_row_ind(csr_sparse_matrix->batch_offset(i)), nnz_i); OP_REQUIRES_OK(c, csr_to_coo(c, csr_row_ptr_i, coo_row_ind_i)); } @@ -222,20 +222,20 @@ REGISTER_GPU(complex128) namespace functor { template <> struct COOSparseMatrixToSparseTensor { - Status operator()(OpKernelContext* ctx, - TTypes::ConstVec host_dense_shape, - TTypes::ConstVec host_batch_ptrs, - TTypes::Vec coo_row_ind, - TTypes::ConstVec coo_col_ind, - TTypes::Matrix indices); + absl::Status operator()(OpKernelContext* ctx, + TTypes::ConstVec host_dense_shape, + TTypes::ConstVec host_batch_ptrs, + TTypes::Vec coo_row_ind, + TTypes::ConstVec coo_col_ind, + TTypes::Matrix indices); }; extern template struct COOSparseMatrixToSparseTensor; template <> struct CSRSparseMatrixToCOOSparseMatrix { - Status operator()(OpKernelContext* c, - TTypes::UnalignedVec csr_row_ptr, - TTypes::UnalignedVec coo_row_ind); + absl::Status operator()(OpKernelContext* c, + TTypes::UnalignedVec csr_row_ptr, + TTypes::UnalignedVec coo_row_ind); }; extern template struct CSRSparseMatrixToCOOSparseMatrix; diff --git a/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc b/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc index 6e635d140ad7df..eda72f21e674f9 100644 --- a/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc +++ b/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc @@ -99,15 +99,16 @@ class DenseToCSRSparseMatrixCPUOp : public OpKernel { TensorShape({(num_rows + 1) * batch_size})); // Fill the row pointers with zeros. - functor::SetZeroFunctor set_zero; - set_zero(ctx->eigen_device(), csr_row_ptr.flat()); + functor::SetZeroFunctor set_zero; + set_zero(ctx->eigen_device(), csr_row_ptr.flat()); // Convert from COO to CSR format. functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr; OP_REQUIRES_OK( - ctx, coo_to_csr(batch_size, num_rows, num_cols, - indices.matrix(), batch_ptr.vec(), - csr_row_ptr.vec(), csr_col_ind.vec())); + ctx, + coo_to_csr(batch_size, num_rows, num_cols, indices.matrix(), + batch_ptr.vec(), csr_row_ptr.vec(), + csr_col_ind.vec())); CSRSparseMatrix output_csr_matrix; OP_REQUIRES_OK(ctx, CSRSparseMatrix::CreateCSRSparseMatrix( @@ -173,7 +174,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel { const int64_t rows = dense_tensor_shape.dim_size((rank == 2) ? 0 : 1); const int64_t cols = dense_tensor_shape.dim_size((rank == 2) ? 
1 : 2); - ScratchSpace nnz_per_batch_host(c, batch_size, /*on_host*/ true); + ScratchSpace nnz_per_batch_host(c, batch_size, /*on_host*/ true); Tensor nnz_per_batch_device_t; if (rank == 2) { @@ -184,7 +185,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel { c->allocate_temp(DT_INT32, TensorShape({batch_size}), &nnz_per_batch_device_t), done); - auto nnz_per_batch_device = nnz_per_batch_device_t.vec(); + auto nnz_per_batch_device = nnz_per_batch_device_t.vec(); functor::CalculateNNZPerBatchMatrixFromIndices calculate_nnz_from_indices; @@ -193,14 +194,14 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel { c, calculate_nnz_from_indices(c, indices, nnz_per_batch_device), done); - stream_executor::DeviceMemoryBase nnz_per_batch_device_ptr( + stream_executor::DeviceAddressBase nnz_per_batch_device_ptr( static_cast(nnz_per_batch_device.data())); OP_REQUIRES_OK_ASYNC( c, stream->Memcpy(nnz_per_batch_host.mutable_data() /*host_dst*/, nnz_per_batch_device_ptr /*gpu_src*/, - batch_size * sizeof(int32) /*size*/), + batch_size * sizeof(int32_t) /*size*/), done); } @@ -215,7 +216,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel { // tensor by the time we get here; we can unreference it. nnz_per_batch_device_ref.Unref(); - auto nnz_per_batch = nnz_per_batch_host.tensor().vec(); + auto nnz_per_batch = nnz_per_batch_host.tensor().vec(); { // Ensure that within the callback, the proper GPU settings are @@ -226,7 +227,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel { // Extract out the values. Tensor temp_values_t; OP_REQUIRES_OK_ASYNC(c, - (functor::DoGatherNd( + (functor::DoGatherNd( c, params_t, indices_t, &temp_values_t)), done); const Tensor& values_t = const_cast(temp_values_t); @@ -248,7 +249,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel { Tensor batch_ptr_t(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1})); - auto batch_ptr = batch_ptr_t.vec(); + auto batch_ptr = batch_ptr_t.vec(); auto indices = indices_t.matrix(); batch_ptr(0) = 0; @@ -285,9 +286,9 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel { &csr_row_ptr_t), done); - auto coo_row_ind = coo_row_ind_t.vec(); - auto coo_col_ind = coo_col_ind_t.vec(); - auto csr_row_ptr = csr_row_ptr_t.vec(); + auto coo_row_ind = coo_row_ind_t.vec(); + auto coo_col_ind = coo_col_ind_t.vec(); + auto csr_row_ptr = csr_row_ptr_t.vec(); // Convert SparseTensor rep to coo row ind, coo col ind. if (total_nnz > 0) { @@ -301,8 +302,8 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel { // a bug if you have empty coo rows. // TODO(ebrevdo): File bug w/ nvidia so coo2csr can handle // zero-element input coo rows. - functor::SetZeroFunctor set_zero; - set_zero(d, csr_row_ptr_t.flat()); + functor::SetZeroFunctor set_zero; + set_zero(d, csr_row_ptr_t.flat()); functor::COOSparseMatrixToCSRSparseMatrix coo_to_csr; for (int i = 0; i < batch_size; ++i) { @@ -312,9 +313,9 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel { // handled by the SetZero above. } else { // Convert coo to csr. 
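// Illustrative sketch (not part of this patch): a CPU reference for the
// coo_to_csr step invoked above. It counts entries per row and takes a
// cumulative sum so that csr_row_ptr[r] .. csr_row_ptr[r + 1] delimits row r.
// The GPU path delegates this to the sparse library (Coo2csr); this plain
// C++ version is only for illustration and assumes the COO entries are
// already grouped by row, as the surrounding code arranges.
#include <cstdint>
#include <vector>

std::vector<std::int32_t> CooRowsToCsrRowPtr(
    const std::vector<std::int32_t>& coo_row_ind, std::int32_t num_rows) {
  std::vector<std::int32_t> csr_row_ptr(num_rows + 1, 0);
  for (std::int32_t row : coo_row_ind) ++csr_row_ptr[row + 1];  // per-row count
  for (std::int32_t r = 0; r < num_rows; ++r) {
    csr_row_ptr[r + 1] += csr_row_ptr[r];  // running (exclusive) prefix sum
  }
  return csr_row_ptr;
}
// (end of sketch)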
- auto coo_row_ind_i = - TTypes::UnalignedVec(&coo_row_ind(batch_ptr(i)), nnz_i); - auto csr_row_ptr_i = TTypes::UnalignedVec( + auto coo_row_ind_i = TTypes::UnalignedVec( + &coo_row_ind(batch_ptr(i)), nnz_i); + auto csr_row_ptr_i = TTypes::UnalignedVec( &csr_row_ptr((rows + 1) * i), rows + 1); OP_REQUIRES_OK_ASYNC( c, coo_to_csr(c, rows, cols, coo_row_ind_i, csr_row_ptr_i), @@ -367,9 +368,9 @@ REGISTER_GPU(GPU, complex128) namespace functor { template <> -Status CalculateNNZPerBatchMatrixFromIndices::operator()( +absl::Status CalculateNNZPerBatchMatrixFromIndices::operator()( OpKernelContext* c, TTypes::ConstMatrix indices, - TTypes::Vec nnz_per_batch); + TTypes::Vec nnz_per_batch); extern template struct CalculateNNZPerBatchMatrixFromIndices; template <> @@ -383,9 +384,9 @@ extern template struct SparseTensorToCOOSparseMatrix; template <> struct COOSparseMatrixToCSRSparseMatrix { - Status operator()(OpKernelContext* c, const int rows, const int cols, - TTypes::UnalignedVec coo_row_ind, - TTypes::UnalignedVec csr_row_ptr) { + absl::Status operator()(OpKernelContext* c, const int rows, const int cols, + TTypes::UnalignedVec coo_row_ind, + TTypes::UnalignedVec csr_row_ptr) { GpuSparse cuda_sparse(c); TF_RETURN_IF_ERROR(cuda_sparse.Initialize()); return cuda_sparse.Coo2csr(coo_row_ind.data(), diff --git a/tensorflow/core/kernels/sparse/kernels.cc b/tensorflow/core/kernels/sparse/kernels.cc index ca7009f942112f..dd84b556e002ab 100644 --- a/tensorflow/core/kernels/sparse/kernels.cc +++ b/tensorflow/core/kernels/sparse/kernels.cc @@ -31,8 +31,8 @@ namespace functor { absl::Status SparseTensorToCSRSparseMatrixCPUFunctor::operator()( int64_t batch_size, int num_rows, int num_cols, - TTypes::ConstMatrix indices, TTypes::Vec batch_ptr, - TTypes::Vec csr_row_ptr, TTypes::Vec csr_col_ind) { + TTypes::ConstMatrix indices, TTypes::Vec batch_ptr, + TTypes::Vec csr_row_ptr, TTypes::Vec csr_col_ind) { // Validate inputs. 
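// Illustrative sketch (not part of this patch): the kind of per-entry
// validation the CPU functor performs before filling the CSR buffers, based
// on the error cases exercised in kernels_test.cc below (batch, row, and
// column indices must be in range; indices must have 2 or 3 columns). Plain
// C++ stand-in with a hypothetical name, not the TensorFlow implementation.
#include <cstdint>
#include <optional>
#include <string>

// Returns an error message, or nullopt if the (batch, row, col) triple is valid.
std::optional<std::string> ValidateCooEntry(
    std::int64_t batch, std::int64_t row, std::int64_t col,
    std::int64_t batch_size, std::int64_t num_rows, std::int64_t num_cols) {
  if (batch < 0 || batch >= batch_size)
    return "Batch index is outside of valid range";
  if (row < 0 || row >= num_rows) return "Row index is outside of valid range";
  if (col < 0 || col >= num_cols)
    return "Column index is outside of valid range";
  return std::nullopt;
}
// (end of sketch)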
if (batch_ptr.size() != batch_size + 1) { return errors::InvalidArgument( diff --git a/tensorflow/core/kernels/sparse/kernels.h b/tensorflow/core/kernels/sparse/kernels.h index aff14ca07910fa..14441de5d3cad3 100644 --- a/tensorflow/core/kernels/sparse/kernels.h +++ b/tensorflow/core/kernels/sparse/kernels.h @@ -42,7 +42,7 @@ template struct CalculateNNZPerBatchMatrixFromIndices { absl::Status operator()(OpKernelContext* c, TTypes::ConstMatrix indices, - TTypes::Vec nnz_per_batch); + TTypes::Vec nnz_per_batch); }; // Split a subset of a SparseTensors' indices into two vectors: @@ -63,8 +63,8 @@ template struct SparseTensorToCOOSparseMatrix { void operator()(const Device& d, TTypes::ConstVec host_dense_shape, TTypes::ConstMatrix indices, - TTypes::Vec coo_row_ind, - TTypes::Vec coo_col_ind); + TTypes::Vec coo_row_ind, + TTypes::Vec coo_col_ind); }; // Write coo batch, row, and column vectors to output matrix indices: @@ -89,9 +89,9 @@ template struct COOSparseMatrixToSparseTensor { absl::Status operator()(OpKernelContext* c, TTypes::ConstVec host_dense_shape, - TTypes::ConstVec host_batch_ptrs, - TTypes::Vec coo_row_ind, - TTypes::ConstVec coo_col_ind, + TTypes::ConstVec host_batch_ptrs, + TTypes::Vec coo_row_ind, + TTypes::ConstVec coo_col_ind, TTypes::Matrix indices); }; @@ -105,8 +105,8 @@ struct COOSparseMatrixToSparseTensor { template struct COOSparseMatrixToCSRSparseMatrix { absl::Status operator()(OpKernelContext* c, const int rows, const int cols, - TTypes::UnalignedVec coo_row_ind, - TTypes::UnalignedVec csr_row_ptr); + TTypes::UnalignedVec coo_row_ind, + TTypes::UnalignedVec csr_row_ptr); }; // Convert a matrix of (batched) coo row and column indices to CSR SparseMatrix @@ -126,9 +126,9 @@ struct COOSparseMatrixToCSRSparseMatrix { struct SparseTensorToCSRSparseMatrixCPUFunctor { absl::Status operator()(int64_t batch_size, int num_rows, int num_cols, TTypes::ConstMatrix indices, - TTypes::Vec batch_ptr, - TTypes::Vec csr_row_ptr, - TTypes::Vec csr_col_ind); + TTypes::Vec batch_ptr, + TTypes::Vec csr_row_ptr, + TTypes::Vec csr_col_ind); }; // Convert a vector of csr row pointers to coo row indices. 
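// Illustrative sketch (not part of this patch): a CPU reference for the
// csr-row-pointers-to-coo-row-indices direction described by the comment just
// above (CSRSparseMatrixToCOOSparseMatrix). Each row index r is repeated
// csr_row_ptr[r + 1] - csr_row_ptr[r] times. The GPU functor uses the sparse
// library instead; this version is only for illustration.
#include <cstdint>
#include <vector>

std::vector<std::int32_t> CsrRowPtrToCooRows(
    const std::vector<std::int32_t>& csr_row_ptr) {
  const std::int32_t num_rows =
      static_cast<std::int32_t>(csr_row_ptr.size()) - 1;
  std::vector<std::int32_t> coo_row_ind(csr_row_ptr.back());
  for (std::int32_t r = 0; r < num_rows; ++r) {
    for (std::int32_t i = csr_row_ptr[r]; i < csr_row_ptr[r + 1]; ++i) {
      coo_row_ind[i] = r;  // row r owns entries [csr_row_ptr[r], csr_row_ptr[r+1])
    }
  }
  return coo_row_ind;
}
// (end of sketch)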
@@ -141,8 +141,8 @@ struct SparseTensorToCSRSparseMatrixCPUFunctor { template struct CSRSparseMatrixToCOOSparseMatrix { absl::Status operator()(OpKernelContext* c, - TTypes::UnalignedConstVec csr_row_ptr, - TTypes::UnalignedVec coo_row_ind); + TTypes::UnalignedConstVec csr_row_ptr, + TTypes::UnalignedVec coo_row_ind); }; // Calculates C = matmul(A, B) or C = matmul(A, B)^T, where A is in CSR format @@ -176,10 +176,10 @@ struct CSRStructureModifyingFunctor { const ConstCSRComponent& b, size_t* bufferSize) = 0; - virtual absl::Status GetOutputStructure(const ConstCSRComponent& a, - const ConstCSRComponent& b, - TTypes::UnalignedVec c_row_ptr, - int* output_nnz, void* workspace) = 0; + virtual absl::Status GetOutputStructure( + const ConstCSRComponent& a, const ConstCSRComponent& b, + TTypes::UnalignedVec c_row_ptr, int* output_nnz, + void* workspace) = 0; virtual absl::Status Compute(const ConstCSRComponent& a, const ConstCSRComponent& b, diff --git a/tensorflow/core/kernels/sparse/kernels_gpu.cu.cc b/tensorflow/core/kernels/sparse/kernels_gpu.cu.cc index 3427538ff98ba4..3c1c79a5f02d6b 100644 --- a/tensorflow/core/kernels/sparse/kernels_gpu.cu.cc +++ b/tensorflow/core/kernels/sparse/kernels_gpu.cu.cc @@ -37,22 +37,22 @@ namespace functor { namespace { struct StridedDataReader { - StridedDataReader(const int64* begin, int stride) + StridedDataReader(const int64_t* begin, int stride) : begin_(begin), stride_(stride) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { return static_cast(ldg(begin_ + idx * stride_)); } - const int64* begin_; + const int64_t* begin_; const int stride_; }; } // namespace template <> -Status CalculateNNZPerBatchMatrixFromIndices::operator()( +absl::Status CalculateNNZPerBatchMatrixFromIndices::operator()( OpKernelContext* c, TTypes::ConstMatrix indices, - TTypes::Vec nnz_per_batch) { + TTypes::Vec nnz_per_batch) { const auto& cu_stream = GetGpuStream(c); const int total_nnz = indices.dimension(0); @@ -96,9 +96,9 @@ Status CalculateNNZPerBatchMatrixFromIndices::operator()( TF_RETURN_IF_ERROR(c->allocate_temp( DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), &temp_storage)); - DCHECK_NE(temp_storage.flat().data(), nullptr); + DCHECK_NE(temp_storage.flat().data(), nullptr); auto second_success = gpuprim::DeviceHistogram::HistogramEven( - /*d_temp_storage*/ temp_storage.flat().data(), + /*d_temp_storage*/ temp_storage.flat().data(), /*temp_storage_bytes&*/ temp_storage_bytes, /*d_samples*/ indices_first_column, /*d_histogram*/ nnz_per_batch.data(), @@ -116,13 +116,13 @@ Status CalculateNNZPerBatchMatrixFromIndices::operator()( temp_storage_bytes, ", status: ", GpuGetErrorString(second_success)); } - return OkStatus(); + return absl::OkStatus(); } // TODO(ebrevdo): Write a custom batch-friendly impl of this to update // the SparseTensor indices directly. template <> -Status CSRSparseMatrixToCOOSparseMatrix::operator()( +absl::Status CSRSparseMatrixToCOOSparseMatrix::operator()( OpKernelContext* c, TTypes::UnalignedVec csr_row_ptr, TTypes::UnalignedVec coo_row_ind) { GpuSparse gpu_sparse(c); @@ -133,7 +133,7 @@ Status CSRSparseMatrixToCOOSparseMatrix::operator()( } template -__global__ void SparseTensorToCOOMatrixKernel(const int64* indices, +__global__ void SparseTensorToCOOMatrixKernel(const int64_t* indices, int* coo_rows_out, int* coo_cols_out, int size) { const int offset = (stride == 3) ? 
1 : 0; @@ -168,7 +168,8 @@ void SparseTensorToCOOSparseMatrix::operator()( __global__ void COOMatrixToSparseTensorKernel2D(const int* coo_rows, const int* coo_cols, - int64* indices_out, int size) { + int64_t* indices_out, + int size) { GPU_1D_KERNEL_LOOP(i, size) { indices_out[i * 2] = static_cast(ldg(coo_rows + i)); indices_out[i * 2 + 1] = static_cast(ldg(coo_cols + i)); @@ -191,7 +192,7 @@ __device__ inline int BinarySearchRange(int* range, int n, int x) { } __global__ void COOMatrixToSparseTensorKernel3D( - const int* coo_rows, const int* coo_cols, int64* indices_out, + const int* coo_rows, const int* coo_cols, int64_t* indices_out, GpuDeviceArrayStruct batch_ptr_s, const int batch_size, const int size) { // Step 1: access the batch ptrs and copy to shared memory. @@ -214,7 +215,7 @@ __global__ void COOMatrixToSparseTensorKernel3D( } template <> -Status COOSparseMatrixToSparseTensor::operator()( +absl::Status COOSparseMatrixToSparseTensor::operator()( OpKernelContext* c, TTypes::ConstVec host_dense_shape, TTypes::ConstVec host_batch_ptr, TTypes::Vec coo_row_ind, TTypes::ConstVec coo_col_ind, TTypes::Matrix indices) { @@ -234,7 +235,7 @@ Status COOSparseMatrixToSparseTensor::operator()( config.block_count, config.thread_per_block, 0, d.stream(), coo_row_ind.data(), coo_col_ind.data(), indices.data(), size)); - return OkStatus(); + return absl::OkStatus(); } else { const int batch_size = host_dense_shape(0); GpuDeviceArrayOnHost batch_ptr_copy(c, host_batch_ptr.size()); @@ -251,7 +252,7 @@ Status COOSparseMatrixToSparseTensor::operator()( config.thread_per_block, shared_memory_size, d.stream(), coo_row_ind.data(), coo_col_ind.data(), indices.data(), batch_ptr_copy.data(), batch_size, size)); - return OkStatus(); + return absl::OkStatus(); } } @@ -281,10 +282,10 @@ __global__ void CSRSparseMatrixBatchMulVecKernel3D( } template -Status CSRSparseMatrixBatchMulVecImpl(OpKernelContext* ctx, - const CSRSparseMatrix& a, - typename TTypes::ConstFlat b, - CSRSparseMatrix* c) { +absl::Status CSRSparseMatrixBatchMulVecImpl(OpKernelContext* ctx, + const CSRSparseMatrix& a, + typename TTypes::ConstFlat b, + CSRSparseMatrix* c) { DCHECK_EQ(a.dims(), 3); const int total_nnz = a.total_nnz(); Tensor c_values_t; @@ -321,7 +322,7 @@ Status CSRSparseMatrixBatchMulVecImpl(OpKernelContext* ctx, config.thread_per_block, shared_memory_size, d.stream(), a_values.data(), b.data(), c_values.data(), batch_ptr_copy.data(), batch_size, total_nnz)); - return OkStatus(); + return absl::OkStatus(); } #define DEFINE_SPARSE_MUL_VEC_GPU(T) \ @@ -416,12 +417,12 @@ __global__ void CSRSparseMatrixSoftmaxKernel3D( } template -Status CSRSparseMatrixSoftmaxGPUImpl(OpKernelContext* ctx, - const CSRSparseMatrix& logits, - typename TTypes::Vec softmax_values) { +absl::Status CSRSparseMatrixSoftmaxGPUImpl( + OpKernelContext* ctx, const CSRSparseMatrix& logits, + typename TTypes::Vec softmax_values) { auto host_dense_shape = logits.dense_shape().vec(); - auto host_batch_ptr = logits.batch_pointers().vec(); - auto row_ptr = logits.row_pointers().vec(); + auto host_batch_ptr = logits.batch_pointers().vec(); + auto row_ptr = logits.row_pointers().vec(); auto logits_values = logits.values().vec(); const int ndims = host_dense_shape.size(); @@ -459,7 +460,7 @@ Status CSRSparseMatrixSoftmaxGPUImpl(OpKernelContext* ctx, logits_values.data(), softmax_values.data())); } - return OkStatus(); + return absl::OkStatus(); } #define DEFINE_SOFTMAX_GPU(T) \ @@ -604,18 +605,19 @@ __global__ void CSRSparseMatrixSoftmaxGradKernel3D( } template -Status 
CSRSparseMatrixSoftmaxGradGPUImpl( +absl::Status CSRSparseMatrixSoftmaxGradGPUImpl( OpKernelContext* ctx, const CSRSparseMatrix& softmax, const CSRSparseMatrix& grad_softmax, typename TTypes::Vec gradient_values) { auto host_dense_shape = softmax.dense_shape().vec(); - auto softmax_host_batch_ptr = softmax.batch_pointers().vec(); - auto softmax_row_ptr = softmax.row_pointers().vec(); - auto softmax_col_ind = softmax.col_indices().vec(); + auto softmax_host_batch_ptr = softmax.batch_pointers().vec(); + auto softmax_row_ptr = softmax.row_pointers().vec(); + auto softmax_col_ind = softmax.col_indices().vec(); auto softmax_values = softmax.values().vec(); - auto grad_softmax_host_batch_ptr = grad_softmax.batch_pointers().vec(); - auto grad_softmax_row_ptr = grad_softmax.row_pointers().vec(); - auto grad_softmax_col_ind = grad_softmax.col_indices().vec(); + auto grad_softmax_host_batch_ptr = + grad_softmax.batch_pointers().vec(); + auto grad_softmax_row_ptr = grad_softmax.row_pointers().vec(); + auto grad_softmax_col_ind = grad_softmax.col_indices().vec(); auto grad_softmax_values = grad_softmax.values().vec(); const int ndims = host_dense_shape.size(); @@ -666,7 +668,7 @@ Status CSRSparseMatrixSoftmaxGradGPUImpl( grad_softmax_values.data(), gradient_values.data())); } - return OkStatus(); + return absl::OkStatus(); } #define DEFINE_SOFTMAX_GRAD_GPU(T) \ diff --git a/tensorflow/core/kernels/sparse/kernels_test.cc b/tensorflow/core/kernels/sparse/kernels_test.cc index 018b8b77a81e34..dc13ef62256357 100644 --- a/tensorflow/core/kernels/sparse/kernels_test.cc +++ b/tensorflow/core/kernels/sparse/kernels_test.cc @@ -38,19 +38,19 @@ TEST(SparseTensorToCSRSparseMatrix, SingleBatchConversion) { test::AsTensor({0, 0, 2, 3, 2, 4, 3, 0}, TensorShape({4, 2})); Tensor batch_ptr(DT_INT32, {2}); Tensor csr_col_ind(DT_INT32, {4}); - auto csr_row_ptr = test::AsTensor({0, 0, 0, 0, 0}); + auto csr_row_ptr = test::AsTensor({0, 0, 0, 0, 0}); functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr; TF_EXPECT_OK(coo_to_csr(/*batch_size=*/1, /*num_rows=*/4, /*num_cols=*/5, indices.template matrix(), - batch_ptr.vec(), csr_row_ptr.vec(), - csr_col_ind.vec())); - - test::ExpectTensorEqual(batch_ptr, test::AsTensor({0, 4})); - test::ExpectTensorEqual(csr_row_ptr, - test::AsTensor({0, 1, 1, 3, 4})); - test::ExpectTensorEqual(csr_col_ind, - test::AsTensor({0, 3, 4, 0})); + batch_ptr.vec(), csr_row_ptr.vec(), + csr_col_ind.vec())); + + test::ExpectTensorEqual(batch_ptr, test::AsTensor({0, 4})); + test::ExpectTensorEqual(csr_row_ptr, + test::AsTensor({0, 1, 1, 3, 4})); + test::ExpectTensorEqual(csr_col_ind, + test::AsTensor({0, 3, 4, 0})); } TEST(SparseTensorToCSRSparseMatrix, BatchConversion) { @@ -63,21 +63,22 @@ TEST(SparseTensorToCSRSparseMatrix, BatchConversion) { Tensor csr_col_ind(DT_INT32, {3}); // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12 Tensor csr_row_ptr(DT_INT32, {12}); - test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); + test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr; TF_EXPECT_OK(coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4, indices.template matrix(), - batch_ptr.vec(), csr_row_ptr.vec(), - csr_col_ind.vec())); - - test::ExpectTensorEqual(batch_ptr, - test::AsTensor({0, 2, 2, 3})); - test::ExpectTensorEqual(csr_row_ptr, - test::AsTensor({0, 1, 1, 2, // - 0, 0, 0, 0, // - 0, 1, 1, 1})); - test::ExpectTensorEqual(csr_col_ind, test::AsTensor({0, 3, 1})); + batch_ptr.vec(), csr_row_ptr.vec(), 
+ csr_col_ind.vec())); + + test::ExpectTensorEqual(batch_ptr, + test::AsTensor({0, 2, 2, 3})); + test::ExpectTensorEqual(csr_row_ptr, + test::AsTensor({0, 1, 1, 2, // + 0, 0, 0, 0, // + 0, 1, 1, 1})); + test::ExpectTensorEqual(csr_col_ind, + test::AsTensor({0, 3, 1})); } TEST(SparseTensorToCSRSparseMatrix, InvalidBatchThrowsIllegalArgument) { @@ -90,13 +91,13 @@ TEST(SparseTensorToCSRSparseMatrix, InvalidBatchThrowsIllegalArgument) { Tensor csr_col_ind(DT_INT32, {3}); // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12 Tensor csr_row_ptr(DT_INT32, {12}); - test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); + test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr; EXPECT_THAT( coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4, - indices.template matrix(), batch_ptr.vec(), - csr_row_ptr.vec(), csr_col_ind.vec()), + indices.template matrix(), batch_ptr.vec(), + csr_row_ptr.vec(), csr_col_ind.vec()), absl_testing::StatusIs(tsl::error::Code::INVALID_ARGUMENT, ::testing::ContainsRegex( "Batch index .* is outside of valid range"))); @@ -111,13 +112,13 @@ TEST(SparseTensorToCSRSparseMatrix, InvalidRowThrowsIllegalArgument) { Tensor csr_col_ind(DT_INT32, {3}); // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12 Tensor csr_row_ptr(DT_INT32, {12}); - test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); + test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr; EXPECT_THAT( coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4, - indices.template matrix(), batch_ptr.vec(), - csr_row_ptr.vec(), csr_col_ind.vec()), + indices.template matrix(), batch_ptr.vec(), + csr_row_ptr.vec(), csr_col_ind.vec()), absl_testing::StatusIs( tsl::error::Code::INVALID_ARGUMENT, ::testing::ContainsRegex("Row index .* is outside of valid range"))); @@ -132,13 +133,13 @@ TEST(SparseTensorToCSRSparseMatrix, InvalidColThrowsIllegalArgument) { Tensor csr_col_ind(DT_INT32, {3}); // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12 Tensor csr_row_ptr(DT_INT32, {12}); - test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); + test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr; EXPECT_THAT( coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4, - indices.template matrix(), batch_ptr.vec(), - csr_row_ptr.vec(), csr_col_ind.vec()), + indices.template matrix(), batch_ptr.vec(), + csr_row_ptr.vec(), csr_col_ind.vec()), absl_testing::StatusIs(tsl::error::Code::INVALID_ARGUMENT, ::testing::ContainsRegex( "Column index .* is outside of valid range"))); @@ -154,13 +155,13 @@ TEST(SparseTensorToCSRSparseMatrix, InvalidRankIllegalArgument) { Tensor csr_col_ind(DT_INT32, {3}); // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12 Tensor csr_row_ptr(DT_INT32, {12}); - test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); + test::FillFn(&csr_row_ptr, [](int unused) { return 0; }); functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr; EXPECT_THAT( coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4, - indices.template matrix(), batch_ptr.vec(), - csr_row_ptr.vec(), csr_col_ind.vec()), + indices.template matrix(), batch_ptr.vec(), + csr_row_ptr.vec(), csr_col_ind.vec()), absl_testing::StatusIs(tsl::error::Code::INVALID_ARGUMENT, ::testing::ContainsRegex( "Indices must have either 2 or 3 columns."))); diff --git 
a/tensorflow/core/kernels/sparse/mat_mul_op.h b/tensorflow/core/kernels/sparse/mat_mul_op.h index 3e55cfbc38f201..5c9bfd8a805a54 100644 --- a/tensorflow/core/kernels/sparse/mat_mul_op.h +++ b/tensorflow/core/kernels/sparse/mat_mul_op.h @@ -276,7 +276,7 @@ class CSRMatMulCPUOp : public CSRMatMulOp { Eigen::Ref GetSparseMatrixRef( const CSRSparseMatrix& csr_matrix, const int batch_index, const int64_t row_begin, const int64_t num_shard_rows, - std::vector* row_ptrs) { + std::vector* row_ptrs) { // Compute the row pointers of the sparse sub-matrix. row_ptrs->resize(num_shard_rows + 1); const int64_t row_offset = @@ -325,7 +325,7 @@ class CSRMatMulCPUOp : public CSRMatMulOp { // Define an Eigen::SparseMatrix over the row range: // [row_begin, row_end) of the CSR SparseMatrix A. - std::vector row_ptrs; + std::vector row_ptrs; auto sparse_matrix = GetSparseMatrixRef( lhs, batch_idx, row_begin, num_shard_rows, &row_ptrs); @@ -396,7 +396,7 @@ class CSRMatMulCPUOp : public CSRMatMulOp { // Define a new sparse sub-matrix from the row range // [row_begin, row_end) of the sparse matrix A. - std::vector row_ptrs; + std::vector row_ptrs; auto sparse_matrix = GetSparseMatrixRef( lhs, batch_idx, row_begin, num_shard_rows, &row_ptrs); @@ -773,9 +773,9 @@ class CSRSparseMatrixMatMul { explicit CSRSparseMatrixMatMul(const bool transpose_output) : transpose_output_(transpose_output) {} - Status Compute(OpKernelContext* ctx, const ConstCSRComponent& a, - typename TTypes::UnalignedConstMatrix b, - typename TTypes::UnalignedMatrix c) { + absl::Status Compute(OpKernelContext* ctx, const ConstCSRComponent& a, + typename TTypes::UnalignedConstMatrix b, + typename TTypes::UnalignedMatrix c) { GpuSparse cuda_sparse(ctx); TF_RETURN_IF_ERROR(cuda_sparse.Initialize()); { @@ -859,11 +859,11 @@ class CSRSparseMatrixMatMul { Tensor buffer; TF_RETURN_IF_ERROR(ctx->allocate_temp( DT_INT8, TensorShape({static_cast(bufferSize)}), &buffer)); - DCHECK(buffer.flat().data() != nullptr); + DCHECK(buffer.flat().data() != nullptr); TF_RETURN_IF_ERROR(cuda_sparse.SpMM(transA, transB, &alpha, matA, matB, &beta, matC, algo, - buffer.flat().data())); + buffer.flat().data())); TF_RETURN_IF_GPUSPARSE_ERROR(cusparseDestroyDnMat(matB)); TF_RETURN_IF_GPUSPARSE_ERROR(cusparseDestroyDnMat(matC)); @@ -940,7 +940,7 @@ class CSRSparseMatrixMatMul { #endif // GOOGLE_CUDA && CUDA_VERSION >= 10020 } - return OkStatus(); + return absl::OkStatus(); } private: @@ -954,8 +954,8 @@ class CSRSparseMatrixMatVec { : transA_(TransposeAndConjugateToGpuSparseOp(transpose_a, conjugate_a, &status_)) {} - Status Compute(OpKernelContext* ctx, const ConstCSRComponent& a, - const T* x, T* y) { + absl::Status Compute(OpKernelContext* ctx, const ConstCSRComponent& a, + const T* x, T* y) { TF_RETURN_IF_ERROR(status_); GpuSparse cuda_sparse(ctx); TF_RETURN_IF_ERROR(cuda_sparse.Initialize()); @@ -1001,11 +1001,11 @@ class CSRSparseMatrixMatVec { #endif } - return OkStatus(); + return absl::OkStatus(); } private: - Status status_; + absl::Status status_; const gpusparseOperation_t transA_; }; diff --git a/tensorflow/core/kernels/sparse/mul_op.cc b/tensorflow/core/kernels/sparse/mul_op.cc index 37ce9a6feb51bf..1a68bcc34e9143 100644 --- a/tensorflow/core/kernels/sparse/mul_op.cc +++ b/tensorflow/core/kernels/sparse/mul_op.cc @@ -125,8 +125,8 @@ class CSRSparseMatrixMulScalar { public: explicit CSRSparseMatrixMulScalar() {} - Status Compute(OpKernelContext* ctx, const CSRSparseMatrix& a, - typename TTypes::ConstScalar b, CSRSparseMatrix* c) { + absl::Status 
Compute(OpKernelContext* ctx, const CSRSparseMatrix& a, + typename TTypes::ConstScalar b, CSRSparseMatrix* c) { const int total_nnz = a.total_nnz(); Tensor c_values_t; TF_RETURN_IF_ERROR(ctx->allocate_temp( @@ -146,7 +146,7 @@ class CSRSparseMatrixMulScalar { functor::BinaryFunctor, 1>().Right( d, c_values, a_values, b, error_ptr); - return OkStatus(); + return absl::OkStatus(); } }; diff --git a/tensorflow/core/kernels/sparse/nnz_op.cc b/tensorflow/core/kernels/sparse/nnz_op.cc index 2006abfe4459b1..ad8095b1cdc925 100644 --- a/tensorflow/core/kernels/sparse/nnz_op.cc +++ b/tensorflow/core/kernels/sparse/nnz_op.cc @@ -53,7 +53,7 @@ class CSRNNZOp : public OpKernel { c, nnz_shape.AddDimWithStatus(csr_sparse_matrix->batch_size())); } OP_REQUIRES_OK(c, c->allocate_output(0, nnz_shape, &nnz_t)); - auto nnz = nnz_t->flat(); + auto nnz = nnz_t->flat(); for (int i = 0; i < csr_sparse_matrix->batch_size(); ++i) { nnz(i) = csr_sparse_matrix->nnz(i); } diff --git a/tensorflow/core/kernels/sparse/sparse_cholesky_op.cc b/tensorflow/core/kernels/sparse/sparse_cholesky_op.cc index 51f867277c6a55..afe0a1322ba866 100644 --- a/tensorflow/core/kernels/sparse/sparse_cholesky_op.cc +++ b/tensorflow/core/kernels/sparse/sparse_cholesky_op.cc @@ -93,7 +93,7 @@ class CSRSparseCholeskyCPUOp : public OpKernel { // Allocate batch pointers. Tensor batch_ptr(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1})); - auto batch_ptr_vec = batch_ptr.vec(); + auto batch_ptr_vec = batch_ptr.vec(); batch_ptr_vec(0) = 0; // Temporary vector of Eigen SparseMatrices to store the Sparse Cholesky @@ -130,7 +130,7 @@ class CSRSparseCholeskyCPUOp : public OpKernel { Eigen::NaturalOrdering> solver; auto permutation_indices_flat = - input_permutation_indices.flat().data(); + input_permutation_indices.flat().data(); // Invert the fill-in reducing ordering and apply it to the input // sparse matrix. @@ -183,8 +183,8 @@ class CSRSparseCholeskyCPUOp : public OpKernel { Tensor output_col_ind(cpu_allocator(), DT_INT32, TensorShape({total_nnz})); Tensor output_values(cpu_allocator(), DataTypeToEnum::value, TensorShape({total_nnz})); - auto output_row_ptr_ptr = output_row_ptr.flat().data(); - auto output_col_ind_ptr = output_col_ind.flat().data(); + auto output_row_ptr_ptr = output_row_ptr.flat().data(); + auto output_col_ind_ptr = output_col_ind.flat().data(); auto output_values_ptr = output_values.flat().data(); // Copy the output matrices from each batch into the CSRSparseMatrix diff --git a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc index c961ec282b4ed0..be11f9d81065a6 100644 --- a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc +++ b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc @@ -188,7 +188,7 @@ class CSRSparseMatMulCPUOp : public OpKernel { // Set batch pointers. Tensor batch_ptr(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1})); - auto batch_ptr_vec = batch_ptr.vec(); + auto batch_ptr_vec = batch_ptr.vec(); batch_ptr_vec(0) = 0; // Store intermediate matrix products for each batch. 
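// Illustrative sketch (not part of this patch): how a batch pointer vector
// like the one initialized above is filled. It has batch_size + 1 entries,
// starts at 0, and each entry adds the nnz of one batch, so
// batch_ptr[i + 1] - batch_ptr[i] is the nnz of batch i and batch_ptr.back()
// is the total nnz. Plain C++ for illustration only.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::int32_t> BuildBatchPointers(
    const std::vector<std::int32_t>& nnz_per_batch) {
  std::vector<std::int32_t> batch_ptr(nnz_per_batch.size() + 1, 0);
  for (std::size_t i = 0; i < nnz_per_batch.size(); ++i) {
    batch_ptr[i + 1] = batch_ptr[i] + nnz_per_batch[i];  // running total
  }
  return batch_ptr;
}
// (end of sketch)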
@@ -248,8 +248,8 @@ class CSRSparseMatMulCPUOp : public OpKernel { Tensor output_col_ind(cpu_allocator(), DT_INT32, TensorShape({total_nnz})); Tensor output_values(cpu_allocator(), DataTypeToEnum::value, TensorShape({total_nnz})); - auto output_row_ptr_ptr = output_row_ptr.flat().data(); - auto output_col_ind_ptr = output_col_ind.flat().data(); + auto output_row_ptr_ptr = output_row_ptr.flat().data(); + auto output_col_ind_ptr = output_col_ind.flat().data(); auto output_values_ptr = output_values.flat().data(); // Copy the output matrices from each batch into the CSRSparseMatrix @@ -411,14 +411,14 @@ class CSRSparseMatMulGPUOp : public OpKernel { Tensor c_batch_ptr_t(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1})); - auto c_batch_ptr = c_batch_ptr_t.vec(); + auto c_batch_ptr = c_batch_ptr_t.vec(); c_batch_ptr(0) = 0; Tensor c_row_ptr_t; OP_REQUIRES_OK(ctx, ctx->allocate_temp( DT_INT32, TensorShape({batch_size * (rows + 1)}), &c_row_ptr_t)); - auto c_row_ptr = c_row_ptr_t.vec(); + auto c_row_ptr = c_row_ptr_t.vec(); // Possibly transpose a. const CSRSparseMatrix* a_input_matrix; @@ -506,7 +506,7 @@ class CSRSparseMatMulGPUOp : public OpKernel { DT_INT8, TensorShape({static_cast(bufferSize1)}), &buffer1_t)); } - void* buffer1 = buffer1_t.flat().data(); + void* buffer1 = buffer1_t.flat().data(); // Do workEstimation using buffer1. // buffer1 implicitly captured in gemmDesc for use in the compute call. @@ -525,7 +525,7 @@ class CSRSparseMatMulGPUOp : public OpKernel { DT_INT8, TensorShape({static_cast(bufferSize2)}), &buffer2_t)); } - void* buffer2 = buffer2_t.flat().data(); + void* buffer2 = buffer2_t.flat().data(); // Compute the gemm. // Note that buffer1 is implicitly consumed here and buffer2 is implicitly @@ -552,7 +552,7 @@ class CSRSparseMatMulGPUOp : public OpKernel { // Copy product to final c_row_ptr and intermediate column and values // tensors. void* row_ptr = &c_row_ptr(i * (rows + 1)); - void* col_ptr = colidx_tmp.flat().data(); + void* col_ptr = colidx_tmp.flat().data(); void* val_ptr = values_tmp.flat().data(); cusparseStatus_t cusp_status = cusparseCsrSetPointers(matC.get(), row_ptr, col_ptr, val_ptr); @@ -643,8 +643,8 @@ class CSRSparseMatMulGPUOp : public OpKernel { b_input_matrix->values_vec(b_batch), b_input_dense_shape}; - TTypes::UnalignedVec c_row_ptr_i(&c_row_ptr(i * (rows + 1)), - rows + 1); + TTypes::UnalignedVec c_row_ptr_i(&c_row_ptr(i * (rows + 1)), + rows + 1); int c_nnz_i; OP_REQUIRES_OK(ctx, diff --git a/tensorflow/core/kernels/sparse/sparse_matrix.h b/tensorflow/core/kernels/sparse/sparse_matrix.h index 8e5ff45f57d30a..d3db1f29871d80 100644 --- a/tensorflow/core/kernels/sparse/sparse_matrix.h +++ b/tensorflow/core/kernels/sparse/sparse_matrix.h @@ -217,40 +217,40 @@ class CSRSparseMatrix { return dense_shape_; } - inline TTypes::UnalignedVec row_pointers_vec(int batch) { + inline TTypes::UnalignedVec row_pointers_vec(int batch) { DCHECK(valid()); DCHECK_LT(batch, batch_size()); const int64_t rows = dense_shape().vec()((dims() == 2) ? 0 : 1); const int offset = batch * (rows + 1); - return TTypes::UnalignedVec(row_pointers_vec_->data() + offset, - rows + 1); + return TTypes::UnalignedVec(row_pointers_vec_->data() + offset, + rows + 1); } - inline TTypes::UnalignedConstVec row_pointers_vec(int batch) const { + inline TTypes::UnalignedConstVec row_pointers_vec(int batch) const { DCHECK(valid()); DCHECK_LT(batch, batch_size()); const int64_t rows = dense_shape().vec()((dims() == 2) ? 
0 : 1); const int offset = batch * (rows + 1); - return TTypes::UnalignedConstVec(row_pointers_vec_->data() + offset, - rows + 1); + return TTypes::UnalignedConstVec( + row_pointers_vec_->data() + offset, rows + 1); } - inline TTypes::UnalignedVec col_indices_vec(int batch) { + inline TTypes::UnalignedVec col_indices_vec(int batch) { DCHECK(valid()); DCHECK_LT(batch, batch_size()); const int offset = (*batch_pointers_vec_)(batch); const int nnz_in_batch = nnz(batch); - return TTypes::UnalignedVec(col_indices_vec_->data() + offset, - nnz_in_batch); + return TTypes::UnalignedVec(col_indices_vec_->data() + offset, + nnz_in_batch); } - inline TTypes::UnalignedConstVec col_indices_vec(int batch) const { + inline TTypes::UnalignedConstVec col_indices_vec(int batch) const { DCHECK(valid()); DCHECK_LT(batch, batch_size()); const int offset = (*batch_pointers_vec_)(batch); const int nnz_in_batch = nnz(batch); - return TTypes::UnalignedConstVec(col_indices_vec_->data() + offset, - nnz_in_batch); + return TTypes::UnalignedConstVec(col_indices_vec_->data() + offset, + nnz_in_batch); } template @@ -411,9 +411,11 @@ class CSRSparseMatrix { void SetupVecs() { if (!metadata_.validated) return; batch_pointers_vec_.reset( - new TTypes::Vec(batch_pointers_.vec())); - row_pointers_vec_.reset(new TTypes::Vec(row_pointers_.vec())); - col_indices_vec_.reset(new TTypes::Vec(col_indices_.vec())); + new TTypes::Vec(batch_pointers_.vec())); + row_pointers_vec_.reset( + new TTypes::Vec(row_pointers_.vec())); + col_indices_vec_.reset( + new TTypes::Vec(col_indices_.vec())); } void ClearVecs() { @@ -537,9 +539,9 @@ class CSRSparseMatrix { Tensor row_pointers_; Tensor col_indices_; Tensor values_; - std::unique_ptr::Vec> batch_pointers_vec_; - std::unique_ptr::Vec> row_pointers_vec_; - std::unique_ptr::Vec> col_indices_vec_; + std::unique_ptr::Vec> batch_pointers_vec_; + std::unique_ptr::Vec> row_pointers_vec_; + std::unique_ptr::Vec> col_indices_vec_; }; // Call BinaryFunctor()(ctx, a, b, c) @@ -616,16 +618,16 @@ absl::Status CSRSparseMatrixUnaryHelper(OpKernelContext* ctx, template struct ConstCSRComponent { - TTypes::UnalignedConstVec row_ptr; - TTypes::UnalignedConstVec col_ind; + TTypes::UnalignedConstVec row_ptr; + TTypes::UnalignedConstVec col_ind; typename TTypes::UnalignedConstVec values; TTypes::ConstVec dense_shape_host; }; template struct CSRComponent { - TTypes::UnalignedVec row_ptr; - TTypes::UnalignedVec col_ind; + TTypes::UnalignedVec row_ptr; + TTypes::UnalignedVec col_ind; typename TTypes::UnalignedVec values; TTypes::Vec dense_shape_host; }; diff --git a/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc b/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc index 5c1a0f007ed656..d25a86056b574b 100644 --- a/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc +++ b/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc @@ -57,7 +57,7 @@ class CSRSparseMatrixComponentsOp : public OpKernel { OP_REQUIRES(c, index_t.dims() == 0, errors::InvalidArgument("index should be a scalar, but saw: ", index_t.DebugString())); - int32_t index = index_t.scalar()(); + int32_t index = index_t.scalar()(); OP_REQUIRES(c, index >= 0 && index < csr_sparse_matrix->batch_size(), errors::InvalidArgument("index (", index, ") not in [0, ", csr_sparse_matrix->batch_size(), ")")); @@ -67,7 +67,7 @@ class CSRSparseMatrixComponentsOp : public OpKernel { c->set_output(1, csr_sparse_matrix->col_indices()); c->set_output(2, csr_sparse_matrix->values()); } else { - auto batch_ptrs = 
csr_sparse_matrix->batch_pointers().vec(); + auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec(); auto dense_shape = csr_sparse_matrix->dense_shape().vec(); int64_t rows = dense_shape(1); int nnz = batch_ptrs(index + 1) - batch_ptrs(index); @@ -78,23 +78,23 @@ class CSRSparseMatrixComponentsOp : public OpKernel { c, c->allocate_output(0, TensorShape({rows + 1}), &row_ptrs_t)); OP_REQUIRES_OK(c, c->allocate_output(1, TensorShape({nnz}), &col_inds_t)); OP_REQUIRES_OK(c, c->allocate_output(2, TensorShape({nnz}), &values_t)); - auto row_ptrs = row_ptrs_t->vec(); - auto col_inds = col_inds_t->vec(); + auto row_ptrs = row_ptrs_t->vec(); + auto col_inds = col_inds_t->vec(); auto values = values_t->vec(); - functor::Slice slice_int; + functor::Slice slice_int; functor::Slice slice_t; typedef Eigen::DSizes EVec; const Device& d = c->eigen_device(); slice_int(d, /*output*/ row_ptrs, - /*input*/ csr_sparse_matrix->row_pointers().vec(), + /*input*/ csr_sparse_matrix->row_pointers().vec(), /*slice_indices*/ EVec{static_cast(index * (rows + 1))}, /*slice_sizes*/ EVec{static_cast(rows + 1)}); slice_int(d, /*output*/ col_inds, - /*input*/ csr_sparse_matrix->col_indices().vec(), + /*input*/ csr_sparse_matrix->col_indices().vec(), /*slice_indices*/ EVec{batch_ptrs(index)}, /*slice_sizes*/ EVec{nnz}); slice_t(d, @@ -137,7 +137,7 @@ namespace functor { const Eigen::DSizes& sizes); \ extern template struct Slice; -DECLARE_GPU_SPEC(int32); +DECLARE_GPU_SPEC(int32_t); DECLARE_GPU_SPEC(float); DECLARE_GPU_SPEC(double); DECLARE_GPU_SPEC(complex64); diff --git a/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc b/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc index e93e2b0a018845..7d7bba8601da64 100644 --- a/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc +++ b/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc @@ -74,7 +74,7 @@ class SparseTensorToCSRSparseMatrixCPUOp : public OpKernel { const int64_t num_cols = dense_shape_vec((rank == 2) ? 1 : 2); const int64_t total_nnz = values.NumElements(); - static constexpr int64_t kInt32Max = std::numeric_limits::max(); + static constexpr int64_t kInt32Max = std::numeric_limits::max(); OP_REQUIRES( ctx, batch_size < kInt32Max, errors::InvalidArgument("dense_shape batch_size must be < Int32Max," @@ -106,16 +106,16 @@ class SparseTensorToCSRSparseMatrixCPUOp : public OpKernel { Tensor csr_row_ptr(cpu_allocator(), DT_INT32, csr_row_ind_shape); // Fill the row pointers with zeros. - functor::SetZeroFunctor set_zero; - set_zero(ctx->eigen_device(), csr_row_ptr.flat()); + functor::SetZeroFunctor set_zero; + set_zero(ctx->eigen_device(), csr_row_ptr.flat()); // Convert from COO to CSR format. functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr; OP_REQUIRES_OK( ctx, coo_to_csr(batch_size, num_rows, num_cols, - indices.template matrix(), batch_ptr.vec(), - csr_row_ptr.vec(), csr_col_ind.vec())); + indices.template matrix(), batch_ptr.vec(), + csr_row_ptr.vec(), csr_col_ind.vec())); // Create the CSRSparseMatrix object from its component Tensors and prepare // the Variant output Tensor. @@ -166,7 +166,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { const int64_t rows = dense_shape((rank == 2) ? 0 : 1); const int64_t cols = dense_shape((rank == 2) ? 
1 : 2); - static constexpr int64_t kInt32Max = std::numeric_limits::max(); + static constexpr int64_t kInt32Max = std::numeric_limits::max(); OP_REQUIRES_ASYNC( c, batch_size < kInt32Max, errors::InvalidArgument("dense_shape batch_size must be < Int32Max," @@ -187,7 +187,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { (rows + 1) * batch_size), done); - ScratchSpace nnz_per_batch_host(c, batch_size, /*on_host*/ true); + ScratchSpace nnz_per_batch_host(c, batch_size, /*on_host*/ true); Tensor nnz_per_batch_device_t; if (rank == 2) { @@ -198,7 +198,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { c->allocate_temp(DT_INT32, TensorShape({batch_size}), &nnz_per_batch_device_t), done); - auto nnz_per_batch_device = nnz_per_batch_device_t.vec(); + auto nnz_per_batch_device = nnz_per_batch_device_t.vec(); functor::CalculateNNZPerBatchMatrixFromIndices calculate_nnz_from_indices; @@ -207,14 +207,14 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { c, calculate_nnz_from_indices(c, indices, nnz_per_batch_device), done); - stream_executor::DeviceMemoryBase nnz_per_batch_device_ptr( + stream_executor::DeviceAddressBase nnz_per_batch_device_ptr( static_cast(nnz_per_batch_device.data())); OP_REQUIRES_OK_ASYNC( c, stream->Memcpy(nnz_per_batch_host.mutable_data() /*host_dst*/, nnz_per_batch_device_ptr /*gpu_src*/, - batch_size * sizeof(int32) /*size*/), + batch_size * sizeof(int32_t) /*size*/), done); } @@ -227,7 +227,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { // tensor by the time we get here; we can unreference it. nnz_per_batch_device_ref.Unref(); - auto nnz_per_batch = nnz_per_batch_host.tensor().vec(); + auto nnz_per_batch = nnz_per_batch_host.tensor().vec(); // Ensure that within the callback, the proper GPU settings are // configured. @@ -237,7 +237,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { Tensor batch_ptr_t(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1})); - auto batch_ptr = batch_ptr_t.vec(); + auto batch_ptr = batch_ptr_t.vec(); auto indices = indices_t.matrix(); batch_ptr(0) = 0; @@ -274,9 +274,9 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { &csr_row_ptr_t), done); - auto coo_row_ind = coo_row_ind_t.vec(); - auto coo_col_ind = coo_col_ind_t.vec(); - auto csr_row_ptr = csr_row_ptr_t.vec(); + auto coo_row_ind = coo_row_ind_t.vec(); + auto coo_col_ind = coo_col_ind_t.vec(); + auto csr_row_ptr = csr_row_ptr_t.vec(); // Convert SparseTensor rep to coo row ind, coo col ind. if (total_nnz > 0) { @@ -290,8 +290,8 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { // a bug if you have empty coo rows. // TODO(ebrevdo): File bug w/ nvidia so coo2csr can handle // zero-element input coo rows. - functor::SetZeroFunctor set_zero; - set_zero(d, csr_row_ptr_t.flat()); + functor::SetZeroFunctor set_zero; + set_zero(d, csr_row_ptr_t.flat()); functor::COOSparseMatrixToCSRSparseMatrix coo_to_csr; for (int i = 0; i < batch_size; ++i) { @@ -301,9 +301,9 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { // handled by the SetZero above. } else { // Convert coo to csr. 
- auto coo_row_ind_i = - TTypes::UnalignedVec(&coo_row_ind(batch_ptr(i)), nnz_i); - auto csr_row_ptr_i = TTypes::UnalignedVec( + auto coo_row_ind_i = TTypes::UnalignedVec( + &coo_row_ind(batch_ptr(i)), nnz_i); + auto csr_row_ptr_i = TTypes::UnalignedVec( &csr_row_ptr((rows + 1) * i), rows + 1); OP_REQUIRES_OK_ASYNC( c, coo_to_csr(c, rows, cols, coo_row_ind_i, csr_row_ptr_i), @@ -345,9 +345,9 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel { namespace functor { template <> -Status CalculateNNZPerBatchMatrixFromIndices::operator()( +absl::Status CalculateNNZPerBatchMatrixFromIndices::operator()( OpKernelContext* c, TTypes::ConstMatrix indices, - TTypes::Vec nnz_per_batch); + TTypes::Vec nnz_per_batch); extern template struct CalculateNNZPerBatchMatrixFromIndices; template <> @@ -361,9 +361,9 @@ extern template struct SparseTensorToCOOSparseMatrix; template <> struct COOSparseMatrixToCSRSparseMatrix { - Status operator()(OpKernelContext* c, const int rows, const int cols, - TTypes::UnalignedVec coo_row_ind, - TTypes::UnalignedVec csr_row_ptr) { + absl::Status operator()(OpKernelContext* c, const int rows, const int cols, + TTypes::UnalignedVec coo_row_ind, + TTypes::UnalignedVec csr_row_ptr) { GpuSparse cuda_sparse(c); TF_RETURN_IF_ERROR(cuda_sparse.Initialize()); return cuda_sparse.Coo2csr(coo_row_ind.data(), diff --git a/tensorflow/core/kernels/sparse/transpose_op.cc b/tensorflow/core/kernels/sparse/transpose_op.cc index 74e0b85f393e40..234b00e5749593 100644 --- a/tensorflow/core/kernels/sparse/transpose_op.cc +++ b/tensorflow/core/kernels/sparse/transpose_op.cc @@ -182,9 +182,9 @@ absl::Status CSRSparseMatrixTranspose::operator()( // Set the output row pointers to zero, in case we hit any empty // input batches. - functor::SetZeroFunctor set_zero; + functor::SetZeroFunctor set_zero; const Device& d = ctx->eigen_device(); - set_zero(d, output_row_ptr_t.flat()); + set_zero(d, output_row_ptr_t.flat()); functor::CSRSparseMatrixTransposeComponent transpose_component; for (int i = 0; i < batch_size; ++i) { @@ -255,8 +255,8 @@ struct CSRSparseMatrixTransposeComponent { template struct CSRSparseMatrixTransposeComponent { - Status operator()(OpKernelContext* ctx, const ConstCSRComponent& x, - CSRComponent* y) { + absl::Status operator()(OpKernelContext* ctx, const ConstCSRComponent& x, + CSRComponent* y) { TF_RETURN_IF_ERROR(ValidateTransposeInputs(x, *y)); GpuSparse cuda_sparse(ctx); TF_RETURN_IF_ERROR(cuda_sparse.Initialize()); @@ -277,7 +277,7 @@ struct CSRSparseMatrixTransposeComponent { x.col_ind.data() /*csrColInd*/, y->values.data() /*cscVal*/, y->col_ind.data() /*cscRowInd*/, y->row_ptr.data() /*cscColPtr*/, copyValues); - return OkStatus(); + return absl::OkStatus(); } }; #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/sparse/zeros_op.h b/tensorflow/core/kernels/sparse/zeros_op.h index 2a86089e04e62e..8f6c09fdb0fa68 100644 --- a/tensorflow/core/kernels/sparse/zeros_op.h +++ b/tensorflow/core/kernels/sparse/zeros_op.h @@ -54,7 +54,7 @@ struct CSRSparseMatrixZeros { Tensor batch_ptr_t(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1})); - batch_ptr_t.vec().setZero(); // On host. + batch_ptr_t.vec().setZero(); // On host. 
Allocator* allocator = c->device()->GetAllocator(AllocatorAttributes()); // An all-zeros CSR matrix is composed of an empty set of column @@ -66,10 +66,10 @@ struct CSRSparseMatrixZeros { Tensor coo_col_ind_t(allocator, DT_INT32, TensorShape({0})); Tensor csr_values_t(allocator, dtype, TensorShape({0})); const Device& d = c->eigen_device(); - functor::SetZeroFunctor set_zero; + functor::SetZeroFunctor set_zero; TF_RETURN_IF_ERROR(c->allocate_temp( DT_INT32, TensorShape({batch_size * (rows + 1)}), &csr_row_ptr_t)); - set_zero(d, csr_row_ptr_t.flat()); + set_zero(d, csr_row_ptr_t.flat()); TF_RETURN_IF_ERROR(CSRSparseMatrix::CreateCSRSparseMatrix( dtype, dense_shape_t, batch_ptr_t, csr_row_ptr_t, coo_col_ind_t, diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc index bd2956c734a1b7..7bd5f5be719565 100644 --- a/tensorflow/core/kernels/tensor_array_ops.cc +++ b/tensorflow/core/kernels/tensor_array_ops.cc @@ -836,24 +836,24 @@ TF_CALL_COMPLEX_TYPES(REGISTER_GPU); REGISTER_KERNEL_BUILDER( Name("TensorArrayGather") .Device(DEVICE_GPU) - .TypeConstraint("dtype") + .TypeConstraint("dtype") .HostMemory("indices") .HostMemory("handle"), - TensorArrayPackOrGatherOp); + TensorArrayPackOrGatherOp); REGISTER_KERNEL_BUILDER( Name("TensorArrayGatherV2") .Device(DEVICE_GPU) - .TypeConstraint("dtype") + .TypeConstraint("dtype") .HostMemory("indices") .HostMemory("handle"), - TensorArrayPackOrGatherOp); + TensorArrayPackOrGatherOp); REGISTER_KERNEL_BUILDER( Name("TensorArrayGatherV3") .Device(DEVICE_GPU) - .TypeConstraint("dtype") + .TypeConstraint("dtype") .HostMemory("indices") .HostMemory("handle"), - TensorArrayPackOrGatherOp); + TensorArrayPackOrGatherOp); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -1050,22 +1050,22 @@ TF_CALL_COMPLEX_TYPES(REGISTER_GPU); // registration requires all int32 inputs and outputs to be in host memory. REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat") .Device(DEVICE_GPU) - .TypeConstraint("dtype") + .TypeConstraint("dtype") .HostMemory("lengths") .HostMemory("handle"), - TensorArrayConcatOp); + TensorArrayConcatOp); REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2") .Device(DEVICE_GPU) - .TypeConstraint("dtype") + .TypeConstraint("dtype") .HostMemory("lengths") .HostMemory("handle"), - TensorArrayConcatOp); + TensorArrayConcatOp); REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3") .Device(DEVICE_GPU) - .TypeConstraint("dtype") + .TypeConstraint("dtype") .HostMemory("lengths") .HostMemory("handle"), - TensorArrayConcatOp); + TensorArrayConcatOp); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc index b1207aeea7f674..b23b7a1b4d4e81 100644 --- a/tensorflow/core/kernels/unique_op.cc +++ b/tensorflow/core/kernels/unique_op.cc @@ -83,10 +83,10 @@ class UniqueOp : public OpKernel { // TODO(dga): Make unique polymorphic for returning int32 and int64 // vectors to support large tensors. 
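// Editorial aside (not part of the patch): the size guard below keeps the same
// numeric ceiling after this cleanup -- std::numeric_limits<int32_t>::max() is
// the 2,147,483,647-element limit the old TF int32 alias spelled; only the
// spelling moves to the standard fixed-width type.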
OP_REQUIRES(context, - input.NumElements() <= std::numeric_limits::max(), + input.NumElements() <= std::numeric_limits::max(), errors::InvalidArgument( "unique does not support input tensors larger than ", - std::numeric_limits::max(), " elements")); + std::numeric_limits::max(), " elements")); int64_t axis = 0; std::vector new_sizes{1, input.NumElements(), 1}; @@ -115,7 +115,7 @@ class UniqueOp : public OpKernel { "axis tensor should be int32 or int64, but got ", DataTypeString(axis_tensor.dtype()))); if (axis_tensor.dtype() == DT_INT32) { - axis = internal::SubtleMustCopy(axis_tensor.scalar()()); + axis = internal::SubtleMustCopy(axis_tensor.scalar()()); } else { axis = internal::SubtleMustCopy(axis_tensor.scalar()()); } diff --git a/tensorflow/core/kernels/unique_op_test.cc b/tensorflow/core/kernels/unique_op_test.cc index e21c0bfad6ae52..b870921666bd83 100644 --- a/tensorflow/core/kernels/unique_op_test.cc +++ b/tensorflow/core/kernels/unique_op_test.cc @@ -84,7 +84,7 @@ void BM_Unique_INT32(::testing::benchmark::State& state) { "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false) .Run(state); state.SetBytesProcessed(static_cast(state.iterations()) * dim * - sizeof(int32)); + sizeof(int32_t)); } void BM_Unique_INT32_Repeat(::testing::benchmark::State& state) { @@ -108,7 +108,7 @@ void BM_Unique_INT32_Repeat(::testing::benchmark::State& state) { "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false) .Run(state); state.SetBytesProcessed(static_cast(state.iterations()) * dim * 200 * - sizeof(int32)); + sizeof(int32_t)); } TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) { @@ -118,7 +118,7 @@ TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) { tensor_proto.mutable_tensor_shape()->set_unknown_rank(false); for (int i = 0; i < dim; ++i) { const int len = std::rand() % max_str_len + 1; - string rand_str; + std::string rand_str; rand_str.resize(len); for (int j = 0; j < len; ++j) { rand_str[j] = static_cast(j % 256); diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index 25eb23de84e177..a43beaecc040be 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -29,8 +29,8 @@ namespace { // Makes a unique name for a temporary variable inside a while loop body, // because loop can be executed in multiple iterations in parallel. 
-string TemporaryVariableName(const string& var_name, - const FrameAndIter& control_frame) { +std::string TemporaryVariableName(const std::string& var_name, + const FrameAndIter& control_frame) { if (control_frame.frame_id != kIllegalFrameId && control_frame.iter_id != kIllegalIterId) { return strings::StrCat(var_name, "/frame:", control_frame.frame_id, @@ -53,7 +53,7 @@ class LegacyVar : public ResourceBase { mutex* mu() { return &mu_; } Tensor* tensor() { return &tensor_; } - string DebugString() const override { + std::string DebugString() const override { return absl::StrCat(DataTypeString(tensor_.dtype()), "/", tensor_.shape().DebugString()); } @@ -130,14 +130,14 @@ class TemporaryVariableOp : public OpKernel { struct TmpVar : public ResourceBase { mutex mu; Tensor val; - string name; - string DebugString() const override { return name; } + std::string name; + std::string DebugString() const override { return name; } ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; } }; TensorShape shape_; DataType dtype_; - string var_name_; + std::string var_name_; }; class DestroyTemporaryVariableOp : public OpKernel { @@ -171,7 +171,7 @@ class DestroyTemporaryVariableOp : public OpKernel { } private: - string var_name_; + std::string var_name_; }; class IsVariableInitializedOp : public OpKernel { diff --git a/tensorflow/core/kernels/variable_ops_test.cc b/tensorflow/core/kernels/variable_ops_test.cc index 0a814aab1db9fe..6ed93a0e643f2f 100644 --- a/tensorflow/core/kernels/variable_ops_test.cc +++ b/tensorflow/core/kernels/variable_ops_test.cc @@ -31,7 +31,7 @@ namespace { void ManyManyVariablesHelper(int threads, int variables, ::testing::benchmark::State& state) { Graph g(OpRegistry::Global()); - std::vector targets; + std::vector targets; for (int i = 0; i < variables; ++i) { Node* v; TF_CHECK_OK( diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc index 1421e24cbb0fdd..42c89f61ff3f48 100644 --- a/tensorflow/core/kernels/where_op.cc +++ b/tensorflow/core/kernels/where_op.cc @@ -259,8 +259,8 @@ class WhereGPUOp : public AsyncOpKernel { const Tensor& input = context->input(0); const int input_dims = input.dims(); - if (input.NumElements() < std::numeric_limits::max()) { - ComputeAsyncType(input, input_dims, context, done); + if (input.NumElements() < std::numeric_limits::max()) { + ComputeAsyncType(input, input_dims, context, done); } else { ComputeAsyncType(input, input_dims, context, done); } @@ -282,7 +282,7 @@ class WhereGPUOp : public AsyncOpKernel { // Push kernel to stream to get number of true elements. 
const GPUDevice& d = context->eigen_device(); - Status s = functor::NumTrue::Compute( + absl::Status s = functor::NumTrue::Compute( context, d, input.flat(), num_true_t); OP_REQUIRES_OK_ASYNC(context, s, done); @@ -374,9 +374,9 @@ TF_CALL_WHERE_GPU_TYPES(REGISTER_GPU_WHERE_OP); REGISTER_KERNEL_BUILDER(Name("Where") .Device(DEVICE_DEFAULT) - .TypeConstraint("T") + .TypeConstraint("T") .HostMemory("input") .HostMemory("index"), - WhereCPUOp); + WhereCPUOp); } // namespace tensorflow diff --git a/tensorflow/core/kernels/while_op_test.cc b/tensorflow/core/kernels/while_op_test.cc index b7f5af047b8186..36c68d2b14e508 100644 --- a/tensorflow/core/kernels/while_op_test.cc +++ b/tensorflow/core/kernels/while_op_test.cc @@ -123,7 +123,8 @@ TEST_F(WhileOpTest, WhileOpCPUBuildWithPluggableDevice) { #if EIGEN_MAX_ALIGN_BYTES == 0 return malloc(size); #else - return tensorflow::port::AlignedMalloc(size, EIGEN_MAX_ALIGN_BYTES); + return tsl::port::AlignedMalloc( + size, static_cast(EIGEN_MAX_ALIGN_BYTES)); #endif }; se_.host_memory_deallocate = [](const SP_Device* const device, void* mem) { @@ -136,7 +137,8 @@ TEST_F(WhileOpTest, WhileOpCPUBuildWithPluggableDevice) { #if EIGEN_MAX_ALIGN_BYTES == 0 mem->opaque = malloc(size); #else - mem->opaque = tensorflow::port::AlignedMalloc(size, EIGEN_MAX_ALIGN_BYTES); + mem->opaque = tsl::port::AlignedMalloc( + size, static_cast(EIGEN_MAX_ALIGN_BYTES)); #endif mem->size = size; }; diff --git a/tensorflow/core/kernels/whole_file_read_ops.cc b/tensorflow/core/kernels/whole_file_read_ops.cc index c5c8e548a7592f..884c7725f43d6b 100644 --- a/tensorflow/core/kernels/whole_file_read_ops.cc +++ b/tensorflow/core/kernels/whole_file_read_ops.cc @@ -35,7 +35,7 @@ limitations under the License. namespace tensorflow { template -static absl::Status ReadEntireFile(Env* env, const string& filename, +static absl::Status ReadEntireFile(Env* env, const std::string& filename, T* contents) { std::unique_ptr file; TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file)); @@ -47,7 +47,7 @@ static absl::Status ReadEntireFile(Env* env, const string& filename, class WholeFileReader : public ReaderBase { public: - WholeFileReader(Env* env, const string& node_name) + WholeFileReader(Env* env, const std::string& node_name) : ReaderBase(absl::StrCat("WholeFileReader '", node_name, "'")), env_(env) {} @@ -136,8 +136,8 @@ class WriteFileOp : public OpKernel { errors::InvalidArgument( "Contents tensor must be scalar, but had shape: ", contents_input->shape().DebugString())); - const string& filename = filename_input->scalar()(); - const string dir(io::Dirname(filename)); + const std::string& filename = filename_input->scalar()(); + const std::string dir(io::Dirname(filename)); if (!context->env()->FileExists(dir).ok()) { OP_REQUIRES_OK(context, context->env()->RecursivelyCreateDir(dir)); } diff --git a/tensorflow/core/lib/core/arena.h b/tensorflow/core/lib/core/arena.h index 14d80422496bd2..d5f6c765e51dd0 100644 --- a/tensorflow/core/lib/core/arena.h +++ b/tensorflow/core/lib/core/arena.h @@ -62,7 +62,7 @@ class Arena { protected: bool SatisfyAlignment(const size_t alignment); - void MakeNewBlock(const uint32 alignment); + void MakeNewBlock(const uint32_t alignment); void* GetMemoryFallback(const size_t size, const int align); void* GetMemory(const size_t size, const int align) { assert(remaining_ <= block_size_); // an invariant @@ -88,7 +88,7 @@ class Arena { // The returned AllocatedBlock* is valid until the next call to AllocNewBlock // or Reset (i.e. 
anything that might affect overflow_blocks_). AllocatedBlock* AllocNewBlock(const size_t block_size, - const uint32 alignment); + const uint32_t alignment); const size_t block_size_; char* freestart_; // beginning of the free space in most recent block diff --git a/tensorflow/core/lib/core/coding_test.cc b/tensorflow/core/lib/core/coding_test.cc index 9efe3d8ec10f2c..4769cddaca0906 100644 --- a/tensorflow/core/lib/core/coding_test.cc +++ b/tensorflow/core/lib/core/coding_test.cc @@ -22,46 +22,46 @@ namespace tensorflow { namespace core { TEST(Coding, Fixed16) { - static const uint16 N = 50000; + static const uint16_t N = 50000; - string s; - for (uint16 v = 0; v < N; v++) { - char buf[sizeof(uint16)]; + std::string s; + for (uint16_t v = 0; v < N; v++) { + char buf[sizeof(uint16_t)]; EncodeFixed16(buf, v); s.append(buf, sizeof(buf)); } const char* p = s.data(); - for (uint16 v = 0; v < N; v++) { - uint16 actual = DecodeFixed16(p); + for (uint16_t v = 0; v < N; v++) { + uint16_t actual = DecodeFixed16(p); ASSERT_EQ(v, actual); - p += sizeof(uint16); + p += sizeof(uint16_t); } } TEST(Coding, Fixed32) { - static const uint32 N = 100000; + static const uint32_t N = 100000; - string s; - for (uint32 v = 0; v < N; v++) { - char buf[sizeof(uint32)]; + std::string s; + for (uint32_t v = 0; v < N; v++) { + char buf[sizeof(uint32_t)]; EncodeFixed32(buf, v); s.append(buf, sizeof(buf)); } const char* p = s.data(); - for (uint32 v = 0; v < N; v++) { - uint32 actual = DecodeFixed32(p); + for (uint32_t v = 0; v < N; v++) { + uint32_t actual = DecodeFixed32(p); ASSERT_EQ(v, actual); - p += sizeof(uint32); + p += sizeof(uint32_t); } } TEST(Coding, Fixed64) { - string s; + std::string s; for (int power = 0; power <= 63; power++) { - uint64 v = static_cast(1) << power; - char buf[sizeof(uint64)]; + uint64_t v = static_cast(1) << power; + char buf[sizeof(uint64_t)]; EncodeFixed64(buf, v - 1); s.append(buf, sizeof(buf)); EncodeFixed64(buf, v + 0); @@ -72,19 +72,19 @@ TEST(Coding, Fixed64) { const char* p = s.data(); for (int power = 0; power <= 63; power++) { - uint64 v = static_cast(1) << power; - uint64 actual; + uint64_t v = static_cast(1) << power; + uint64_t actual; actual = DecodeFixed64(p); ASSERT_EQ(v - 1, actual); - p += sizeof(uint64); + p += sizeof(uint64_t); actual = DecodeFixed64(p); ASSERT_EQ(v + 0, actual); - p += sizeof(uint64); + p += sizeof(uint64_t); actual = DecodeFixed64(p); ASSERT_EQ(v + 1, actual); - p += sizeof(uint64); + p += sizeof(uint64_t); } } @@ -113,17 +113,17 @@ TEST(Coding, EncodingOutput) { } TEST(Coding, Varint32) { - string s; - for (uint32 i = 0; i < (32 * 32); i++) { - uint32 v = (i / 32) << (i % 32); + std::string s; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t v = (i / 32) << (i % 32); PutVarint32(&s, v); } const char* p = s.data(); const char* limit = p + s.size(); - for (uint32 i = 0; i < (32 * 32); i++) { - uint32 expected = (i / 32) << (i % 32); - uint32 actual; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t expected = (i / 32) << (i % 32); + uint32_t actual; p = GetVarint32Ptr(p, limit, &actual); ASSERT_TRUE(p != nullptr); ASSERT_EQ(expected, actual); @@ -133,21 +133,21 @@ TEST(Coding, Varint32) { TEST(Coding, Varint64) { // Construct the list of values to check - std::vector values; + std::vector values; // Some special values values.push_back(0); values.push_back(100); - values.push_back(~static_cast(0)); - values.push_back(~static_cast(0) - 1); - for (uint32 k = 0; k < 64; k++) { + values.push_back(~static_cast(0)); + 
values.push_back(~static_cast(0) - 1); + for (uint32_t k = 0; k < 64; k++) { // Test values near powers of two - const uint64 power = 1ull << k; + const uint64_t power = 1ull << k; values.push_back(power); values.push_back(power - 1); values.push_back(power + 1); } - string s; + std::string s; for (size_t i = 0; i < values.size(); i++) { PutVarint64(&s, values[i]); } @@ -156,7 +156,7 @@ TEST(Coding, Varint64) { const char* limit = p + s.size(); for (size_t i = 0; i < values.size(); i++) { ASSERT_TRUE(p < limit); - uint64 actual; + uint64_t actual; p = GetVarint64Ptr(p, limit, &actual); ASSERT_TRUE(p != nullptr); ASSERT_EQ(values[i], actual); @@ -165,17 +165,17 @@ TEST(Coding, Varint64) { } TEST(Coding, Varint32Overflow) { - uint32 result; - string input("\x81\x82\x83\x84\x85\x11"); + uint32_t result; + std::string input("\x81\x82\x83\x84\x85\x11"); ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) == nullptr); } TEST(Coding, Varint32Truncation) { - uint32 large_value = (1u << 31) + 100; - string s; + uint32_t large_value = (1u << 31) + 100; + std::string s; PutVarint32(&s, large_value); - uint32 result; + uint32_t result; for (size_t len = 0; len < s.size() - 1; len++) { ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr); } @@ -185,17 +185,17 @@ TEST(Coding, Varint32Truncation) { } TEST(Coding, Varint64Overflow) { - uint64 result; - string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); + uint64_t result; + std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result) == nullptr); } TEST(Coding, Varint64Truncation) { - uint64 large_value = (1ull << 63) + 100ull; - string s; + uint64_t large_value = (1ull << 63) + 100ull; + std::string s; PutVarint64(&s, large_value); - uint64 result; + uint64_t result; for (size_t len = 0; len < s.size() - 1; len++) { ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr); } diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc index bb729bf99272cc..85c7ef3d268372 100644 --- a/tensorflow/core/lib/jpeg/jpeg_mem.cc +++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc @@ -58,7 +58,7 @@ class FewerArgsForCompiler { public: FewerArgsForCompiler(int datasize, const UncompressFlags& flags, int64_t* nwarn, - std::function allocate_output) + std::function allocate_output) : datasize_(datasize), flags_(flags), pnwarn_(nwarn), @@ -72,7 +72,7 @@ class FewerArgsForCompiler { const int datasize_; const UncompressFlags flags_; int64_t* const pnwarn_; - std::function allocate_output_; + std::function allocate_output_; int height_read_; // number of scanline lines successfully read int height_; int stride_; @@ -95,7 +95,7 @@ bool IsCropWindowValid(const UncompressFlags& flags, int input_image_width, void no_print(j_common_ptr cinfo) {} #endif -uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { +uint8_t* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { // unpack the argball const int datasize = argball->datasize_; const auto& flags = argball->flags_; @@ -252,8 +252,8 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { target_output_height, components); } #else - uint8* dstdata = argball->allocate_output_(target_output_width, - target_output_height, components); + uint8_t* dstdata = argball->allocate_output_( + target_output_width, target_output_height, components); #endif if (dstdata == nullptr) { 
jpeg_destroy_decompress(&cinfo); @@ -509,12 +509,12 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { // associated libraries aren't good enough to guarantee that 7 // parameters won't get clobbered by the longjmp. So we help // it out a little. -uint8* Uncompress(const void* srcdata, int datasize, - const UncompressFlags& flags, int64_t* nwarn, - std::function allocate_output) { +uint8_t* Uncompress(const void* srcdata, int datasize, + const UncompressFlags& flags, int64_t* nwarn, + std::function allocate_output) { FewerArgsForCompiler argball(datasize, flags, nwarn, std::move(allocate_output)); - uint8* const dstdata = UncompressLow(srcdata, &argball); + uint8_t* const dstdata = UncompressLow(srcdata, &argball); const float fraction_read = argball.height_ == 0 @@ -530,7 +530,7 @@ uint8* Uncompress(const void* srcdata, int datasize, // set the unread pixels to black if (argball.height_read_ != argball.height_) { const int first_bad_line = argball.height_read_; - uint8* start = dstdata + first_bad_line * argball.stride_; + uint8_t* start = dstdata + first_bad_line * argball.stride_; const int nbytes = (argball.height_ - first_bad_line) * argball.stride_; memset(static_cast(start), 0, nbytes); } @@ -538,17 +538,17 @@ uint8* Uncompress(const void* srcdata, int datasize, return dstdata; } -uint8* Uncompress(const void* srcdata, int datasize, - const UncompressFlags& flags, int* pwidth, int* pheight, - int* pcomponents, int64_t* nwarn) { - uint8* buffer = nullptr; - uint8* result = +uint8_t* Uncompress(const void* srcdata, int datasize, + const UncompressFlags& flags, int* pwidth, int* pheight, + int* pcomponents, int64_t* nwarn) { + uint8_t* buffer = nullptr; + uint8_t* result = Uncompress(srcdata, datasize, flags, nwarn, [=, &buffer](int width, int height, int components) { if (pwidth != nullptr) *pwidth = width; if (pheight != nullptr) *pheight = height; if (pcomponents != nullptr) *pcomponents = components; - buffer = new uint8[height * width * components]; + buffer = new uint8_t[height * width * components]; return buffer; }); if (!result) delete[] buffer; @@ -599,7 +599,7 @@ bool GetImageInfo(const void* srcdata, int datasize, int* width, int* height, // Compression namespace { -bool CompressInternal(const uint8* srcdata, int width, int height, +bool CompressInternal(const uint8_t* srcdata, int width, int height, const CompressFlags& flags, tstring* output) { if (output == nullptr) { LOG(ERROR) << "Output buffer is null: "; @@ -711,7 +711,7 @@ bool CompressInternal(const uint8* srcdata, int width, int height, if (!flags.xmp_metadata.empty()) { // XMP metadata is embedded in the APP1 tag of JPEG and requires this // namespace header string (null-terminated) - const string name_space = "http://ns.adobe.com/xap/1.0/"; + const std::string name_space = "http://ns.adobe.com/xap/1.0/"; const int name_space_length = name_space.size(); const int metadata_length = flags.xmp_metadata.size(); const int packet_length = metadata_length + name_space_length + 1; @@ -736,8 +736,8 @@ bool CompressInternal(const uint8* srcdata, int width, int height, new JSAMPLE[width * cinfo.input_components]); while (cinfo.next_scanline < cinfo.image_height) { JSAMPROW row_pointer[1]; // pointer to JSAMPLE row[s] - const uint8* r = &srcdata[cinfo.next_scanline * in_stride]; - uint8* p = static_cast(row_temp.get()); + const uint8_t* r = &srcdata[cinfo.next_scanline * in_stride]; + uint8_t* p = static_cast(row_temp.get()); switch (flags.format) { case FORMAT_RGBA: { for (int i = 0; i < width; 
++i, p += 3, r += 4) { @@ -777,14 +777,14 @@ bool CompressInternal(const uint8* srcdata, int width, int height, bool Compress(const void* srcdata, int width, int height, const CompressFlags& flags, tstring* output) { - return CompressInternal(static_cast(srcdata), width, height, + return CompressInternal(static_cast(srcdata), width, height, flags, output); } tstring Compress(const void* srcdata, int width, int height, const CompressFlags& flags) { tstring temp; - CompressInternal(static_cast(srcdata), width, height, flags, + CompressInternal(static_cast(srcdata), width, height, flags, &temp); // If CompressInternal fails, temp will be empty. return temp; diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.h b/tensorflow/core/lib/jpeg/jpeg_mem.h index 859c4702fd09fa..569abb6b79bf74 100644 --- a/tensorflow/core/lib/jpeg/jpeg_mem.h +++ b/tensorflow/core/lib/jpeg/jpeg_mem.h @@ -87,19 +87,19 @@ struct UncompressFlags { // The function returns a pointer to the raw uncompressed data or NULL if // there was an error. The caller of the function is responsible for // freeing the memory (using delete []). -uint8* Uncompress(const void* srcdata, int datasize, - const UncompressFlags& flags, int* width, int* height, - int* components, // Output only: useful with autodetect - int64_t* nwarn); +uint8_t* Uncompress(const void* srcdata, int datasize, + const UncompressFlags& flags, int* width, int* height, + int* components, // Output only: useful with autodetect + int64_t* nwarn); // Version of Uncompress that allocates memory via a callback. The callback // arguments are (width, height, components). If the size is known ahead of // time this function can return an existing buffer; passing a callback allows // the buffer to be shaped based on the JPEG header. The caller is responsible // for freeing the memory *even along error paths*. -uint8* Uncompress(const void* srcdata, int datasize, - const UncompressFlags& flags, int64_t* nwarn, - std::function allocate_output); +uint8_t* Uncompress(const void* srcdata, int datasize, + const UncompressFlags& flags, int64_t* nwarn, + std::function allocate_output); // Read jpeg header and get image information. Returns true on success. // The width, height, and components points may be null. 
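As context for the jpeg_mem.h change above, here is a minimal caller-side sketch of the callback-allocating Uncompress overload after this patch. It mirrors the pattern used in jpeg_mem_unittest.cc below; the helper name, buffer sizing, and error handling are illustrative assumptions, not code from this change.

// Illustrative sketch only -- assumes the post-change declarations in
// tensorflow/core/lib/jpeg/jpeg_mem.h; DecodeRGB itself is hypothetical.
#include <cstdint>
#include <memory>

#include "tensorflow/core/lib/jpeg/jpeg_mem.h"

std::unique_ptr<uint8_t[]> DecodeRGB(const void* jpeg_bytes, int jpeg_size,
                                     int* width, int* height) {
  tensorflow::jpeg::UncompressFlags flags;
  flags.components = 3;  // request RGB output
  uint8_t* buffer = nullptr;
  // The callback receives (width, height, components) and returns the
  // destination buffer; the caller stays responsible for freeing it,
  // even if decoding fails afterwards.
  uint8_t* result = tensorflow::jpeg::Uncompress(
      jpeg_bytes, jpeg_size, flags, /*nwarn=*/nullptr,
      [&](int w, int h, int c) {
        if (width != nullptr) *width = w;
        if (height != nullptr) *height = h;
        buffer = new uint8_t[static_cast<size_t>(w) * h * c];
        return buffer;
      });
  if (result == nullptr) {
    delete[] buffer;  // decode failed; release the caller-owned buffer
    return nullptr;
  }
  return std::unique_ptr<uint8_t[]>(buffer);
}

The three-output-pointer overload shown first in the header remains available for callers that prefer the library to allocate the buffer with new[].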
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc index 0248a453d5586f..a8c5401bf52e01 100644 --- a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc +++ b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc @@ -41,12 +41,12 @@ namespace { const char kTestData[] = "tensorflow/core/lib/jpeg/testdata/"; -int ComputeSumAbsoluteDifference(const uint8* a, const uint8* b, int width, +int ComputeSumAbsoluteDifference(const uint8_t* a, const uint8_t* b, int width, int height, int a_stride, int b_stride) { int totalerr = 0; for (int i = 0; i < height; i++) { - const uint8* const pa = a + i * a_stride; - const uint8* const pb = b + i * b_stride; + const uint8_t* const pa = a + i * a_stride; + const uint8_t* const pb = b + i * b_stride; for (int j = 0; j < 3 * width; j++) { totalerr += abs(static_cast(pa[j]) - static_cast(pb[j])); } @@ -55,20 +55,21 @@ int ComputeSumAbsoluteDifference(const uint8* a, const uint8* b, int width, } // Reads the contents of the file into output -void ReadFileToStringOrDie(Env* env, const string& filename, string* output) { +void ReadFileToStringOrDie(Env* env, const std::string& filename, + std::string* output) { TF_CHECK_OK(ReadFileToString(env, filename, output)); } -void TestJPEG(Env* env, const string& jpegfile) { +void TestJPEG(Env* env, const std::string& jpegfile) { // Read the data from the jpeg file into memory - string jpeg; + std::string jpeg; ReadFileToStringOrDie(env, jpegfile, &jpeg); const int fsize = jpeg.size(); - const uint8* const temp = absl::bit_cast(jpeg.data()); + const uint8_t* const temp = absl::bit_cast(jpeg.data()); // Try partial decoding (half of the data) int w, h, c; - std::unique_ptr imgdata; + std::unique_ptr imgdata; UncompressFlags flags; flags.components = 3; @@ -91,7 +92,7 @@ void TestJPEG(Env* env, const string& jpegfile) { TEST(JpegMemTest, Jpeg) { Env* env = Env::Default(); - const string data_path = kTestData; + const std::string data_path = kTestData; // Name of a valid jpeg file on the disk TestJPEG(env, data_path + "jpeg_merge_test1.jpg"); @@ -100,16 +101,16 @@ TEST(JpegMemTest, Jpeg) { TestJPEG(env, data_path + "jpeg_merge_test1_cmyk.jpg"); } -void TestCropAndDecodeJpeg(Env* env, const string& jpegfile, +void TestCropAndDecodeJpeg(Env* env, const std::string& jpegfile, const UncompressFlags& default_flags) { // Read the data from the jpeg file into memory - string jpeg; + std::string jpeg; ReadFileToStringOrDie(env, jpegfile, &jpeg); const int fsize = jpeg.size(); - const auto* temp = absl::bit_cast(jpeg.data()); + const auto* temp = absl::bit_cast(jpeg.data()); // Decode the whole image. - std::unique_ptr imgdata1; + std::unique_ptr imgdata1; int w1, h1, c1; { UncompressFlags flags = default_flags; @@ -119,13 +120,13 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile, // If stride is not zero, the default allocator would fail because it // allocate w*h*c bytes, but the actual required bytes should be stride*h. // Therefore, we provide a specialized allocator here. 
- uint8* buffer = nullptr; + uint8_t* buffer = nullptr; imgdata1.reset(Uncompress(temp, fsize, flags, nullptr, [&](int width, int height, int components) { w1 = width; h1 = height; c1 = components; - buffer = new uint8[flags.stride * height]; + buffer = new uint8_t[flags.stride * height]; return buffer; })); } @@ -134,7 +135,7 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile, auto check_crop_and_decode_func = [&](int crop_x, int crop_y, int crop_width, int crop_height) { - std::unique_ptr imgdata2; + std::unique_ptr imgdata2; int w, h, c; UncompressFlags flags = default_flags; flags.crop = true; @@ -145,13 +146,13 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile, if (flags.stride == 0) { imgdata2.reset(Uncompress(temp, fsize, flags, &w, &h, &c, nullptr)); } else { - uint8* buffer = nullptr; + uint8_t* buffer = nullptr; imgdata2.reset(Uncompress(temp, fsize, flags, nullptr, [&](int width, int height, int components) { w = width; h = height; c = components; - buffer = new uint8[flags.stride * height]; + buffer = new uint8_t[flags.stride * height]; return buffer; })); } @@ -164,8 +165,8 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile, const int stride1 = (flags.stride != 0) ? flags.stride : w1 * c; const int stride2 = (flags.stride != 0) ? flags.stride : w * c; for (int i = 0; i < crop_height; i++) { - const uint8* p1 = &imgdata1[(i + crop_y) * stride1 + crop_x * c]; - const uint8* p2 = &imgdata2[i * stride2]; + const uint8_t* p1 = &imgdata1[(i + crop_y) * stride1 + crop_x * c]; + const uint8_t* p2 = &imgdata2[i * stride2]; for (int j = 0; j < c * w; j++) { ASSERT_EQ(p1[j], p2[j]) @@ -185,7 +186,7 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile, TEST(JpegMemTest, CropAndDecodeJpeg) { Env* env = Env::Default(); - const string data_path = kTestData; + const std::string data_path = kTestData; UncompressFlags flags; // Test basic flags for jpeg and cmyk jpeg. 
@@ -195,7 +196,7 @@ TEST(JpegMemTest, CropAndDecodeJpeg) { TEST(JpegMemTest, CropAndDecodeJpegWithRatio) { Env* env = Env::Default(); - const string data_path = kTestData; + const std::string data_path = kTestData; UncompressFlags flags; for (int ratio : {1, 2, 4, 8}) { flags.ratio = ratio; @@ -205,7 +206,7 @@ TEST(JpegMemTest, CropAndDecodeJpegWithRatio) { TEST(JpegMemTest, CropAndDecodeJpegWithComponents) { Env* env = Env::Default(); - const string data_path = kTestData; + const std::string data_path = kTestData; UncompressFlags flags; for (const int components : {0, 1, 3}) { flags.components = components; @@ -215,7 +216,7 @@ TEST(JpegMemTest, CropAndDecodeJpegWithComponents) { TEST(JpegMemTest, CropAndDecodeJpegWithUpScaling) { Env* env = Env::Default(); - const string data_path = kTestData; + const std::string data_path = kTestData; UncompressFlags flags; flags.fancy_upscaling = true; TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); @@ -223,13 +224,13 @@ TEST(JpegMemTest, CropAndDecodeJpegWithUpScaling) { TEST(JpegMemTest, CropAndDecodeJpegWithStride) { Env* env = Env::Default(); - const string data_path = kTestData; + const std::string data_path = kTestData; // Read the data from the jpeg file into memory - string jpeg; + std::string jpeg; ReadFileToStringOrDie(env, data_path + "jpeg_merge_test1.jpg", &jpeg); const int fsize = jpeg.size(); - const auto* temp = absl::bit_cast(jpeg.data()); + const auto* temp = absl::bit_cast(jpeg.data()); int w, h, c; ASSERT_TRUE(GetImageInfo(temp, fsize, &w, &h, &c)); @@ -244,9 +245,9 @@ TEST(JpegMemTest, CropAndDecodeJpegWithStride) { TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); } -void CheckInvalidCropWindowFailed(const uint8* const temp, int fsize, int x, +void CheckInvalidCropWindowFailed(const uint8_t* const temp, int fsize, int x, int y, int w, int h) { - std::unique_ptr imgdata; + std::unique_ptr imgdata; int ww, hh, cc; UncompressFlags flags; flags.components = 3; @@ -261,13 +262,13 @@ void CheckInvalidCropWindowFailed(const uint8* const temp, int fsize, int x, TEST(JpegMemTest, CropAndDecodeJpegWithInvalidCropWindow) { Env* env = Env::Default(); - const string data_path = kTestData; + const std::string data_path = kTestData; // Read the data from the jpeg file into memory - string jpeg; + std::string jpeg; ReadFileToStringOrDie(env, data_path + "jpeg_merge_test1.jpg", &jpeg); const int fsize = jpeg.size(); - const auto* temp = absl::bit_cast(jpeg.data()); + const auto* temp = absl::bit_cast(jpeg.data()); int w, h, c; ASSERT_TRUE(GetImageInfo(temp, fsize, &w, &h, &c)); @@ -293,27 +294,27 @@ TEST(JpegMemTest, Jpeg2) { const int in_w = 256; const int in_h = 256; const int stride1 = 3 * in_w; - const std::unique_ptr refdata1(new uint8[stride1 * in_h]); + const std::unique_ptr refdata1(new uint8_t[stride1 * in_h]); for (int i = 0; i < in_h; i++) { for (int j = 0; j < in_w; j++) { const int offset = i * stride1 + 3 * j; refdata1[offset + 0] = i; refdata1[offset + 1] = j; - refdata1[offset + 2] = static_cast((i + j) >> 1); + refdata1[offset + 2] = static_cast((i + j) >> 1); } } // duplicate with weird input stride const int stride2 = 3 * 357; - const std::unique_ptr refdata2(new uint8[stride2 * in_h]); + const std::unique_ptr refdata2(new uint8_t[stride2 * in_h]); for (int i = 0; i < in_h; i++) { memcpy(&refdata2[i * stride2], &refdata1[i * stride1], 3 * in_w); } // Test compression - string cpdata1, cpdata2; + std::string cpdata1, cpdata2; { - const string kXMP = "XMP_TEST_123"; + const std::string 
kXMP = "XMP_TEST_123"; // Compress it to JPEG CompressFlags flags; @@ -327,7 +328,7 @@ TEST(JpegMemTest, Jpeg2) { CHECK_EQ(cpdata1, cpdata2); // Verify valid XMP. - CHECK_NE(string::npos, cpdata1.find(kXMP)); + CHECK_NE(std::string::npos, cpdata1.find(kXMP)); // Test the other API, where a storage string is supplied tstring cptest; @@ -340,7 +341,7 @@ TEST(JpegMemTest, Jpeg2) { } // Uncompress twice: once with 3 components and once with autodetect. - std::unique_ptr imgdata1; + std::unique_ptr imgdata1; for (const int components : {0, 3}) { // Uncompress it UncompressFlags flags; @@ -366,7 +367,7 @@ TEST(JpegMemTest, Jpeg2) { { UncompressFlags flags; flags.stride = 3 * 411; - const std::unique_ptr imgdata2(new uint8[flags.stride * in_h]); + const std::unique_ptr imgdata2(new uint8_t[flags.stride * in_h]); CHECK(imgdata2.get() == Uncompress(cpdata2.c_str(), cpdata2.length(), flags, nullptr /* nwarn */, [=, &imgdata2](int w, int h, int c) { @@ -404,7 +405,7 @@ TEST(JpegMemTest, Jpeg2) { // Takes JPEG data and reads its headers to determine whether or not the JPEG // was chroma downsampled. -bool IsChromaDownsampled(const string& jpegdata) { +bool IsChromaDownsampled(const std::string& jpegdata) { // Initialize libjpeg structures to have a memory source // Modify the usual jpeg error manager to catch fatal errors. struct jpeg_decompress_struct cinfo; @@ -447,8 +448,8 @@ bool IsChromaDownsampled(const string& jpegdata) { TEST(JpegMemTest, ChromaDownsampling) { // Read the data from a test jpeg file into memory - const string jpegfile = string(kTestData) + "jpeg_merge_test1.jpg"; - string jpeg; + const std::string jpegfile = std::string(kTestData) + "jpeg_merge_test1.jpg"; + std::string jpeg; ReadFileToStringOrDie(Env::Default(), jpegfile, &jpeg); // Verify that compressing the JPEG with chroma downsampling works. 
@@ -458,7 +459,7 @@ TEST(JpegMemTest, ChromaDownsampling) { unflags.components = 3; int w, h, c; int64_t num_warnings; - std::unique_ptr uncompressed(Uncompress( + std::unique_ptr uncompressed(Uncompress( jpeg.c_str(), jpeg.size(), unflags, &w, &h, &c, &num_warnings)); CHECK(uncompressed != nullptr); CHECK_EQ(num_warnings, 0); @@ -476,10 +477,10 @@ TEST(JpegMemTest, ChromaDownsampling) { } } -void TestBadJPEG(Env* env, const string& bad_jpeg_file, int expected_width, - int expected_height, const string& reference_RGB_file, +void TestBadJPEG(Env* env, const std::string& bad_jpeg_file, int expected_width, + int expected_height, const std::string& reference_RGB_file, const bool try_recover_truncated_jpeg) { - string jpeg; + std::string jpeg; ReadFileToStringOrDie(env, bad_jpeg_file, &jpeg); UncompressFlags flags; @@ -487,7 +488,7 @@ void TestBadJPEG(Env* env, const string& bad_jpeg_file, int expected_width, flags.try_recover_truncated_jpeg = try_recover_truncated_jpeg; int width, height, components; - std::unique_ptr imgdata; + std::unique_ptr imgdata; imgdata.reset(Uncompress(jpeg.c_str(), jpeg.size(), flags, &width, &height, &components, nullptr)); if (expected_width > 0) { // we expect the file to decode into 'something' @@ -496,7 +497,7 @@ void TestBadJPEG(Env* env, const string& bad_jpeg_file, int expected_width, CHECK_EQ(components, 3); CHECK(imgdata.get()); if (!reference_RGB_file.empty()) { - string ref; + std::string ref; ReadFileToStringOrDie(env, reference_RGB_file, &ref); CHECK(!memcmp(ref.data(), imgdata.get(), ref.size())); } @@ -507,7 +508,7 @@ void TestBadJPEG(Env* env, const string& bad_jpeg_file, int expected_width, TEST(JpegMemTest, BadJpeg) { Env* env = Env::Default(); - const string data_path = kTestData; + const std::string data_path = kTestData; // Test corrupt file TestBadJPEG(env, data_path + "bad_huffman.jpg", 1024, 768, "", false); diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc index b16584be2f3da5..82bff12556d89c 100644 --- a/tensorflow/core/lib/png/png_io.cc +++ b/tensorflow/core/lib/png/png_io.cc @@ -53,8 +53,8 @@ namespace { (del))) // Convert from 8 bit components to 16. This works in-place. -static void Convert8to16(const uint8* p8, int num_comps, int p8_row_bytes, - int width, int height_in, uint16* p16, +static void Convert8to16(const uint8_t* p8, int num_comps, int p8_row_bytes, + int width, int height_in, uint16_t* p16, int p16_row_bytes) { // Force height*row_bytes computations to use 64 bits. 
Height*width is // enforced to < 29 bits in decode_png_op.cc, but height*row_bytes is @@ -64,17 +64,18 @@ static void Convert8to16(const uint8* p8, int num_comps, int p8_row_bytes, // Adjust pointers to copy backwards width *= num_comps; - CPTR_INC(uint8, p8, (height - 1) * p8_row_bytes + (width - 1) * sizeof(*p8)); - PTR_INC(uint16, p16, + CPTR_INC(uint8_t, p8, + (height - 1) * p8_row_bytes + (width - 1) * sizeof(*p8)); + PTR_INC(uint16_t, p16, (height - 1) * p16_row_bytes + (width - 1) * sizeof(*p16)); int bump8 = width * sizeof(*p8) - p8_row_bytes; int bump16 = width * sizeof(*p16) - p16_row_bytes; for (; height-- != 0; - CPTR_INC(uint8, p8, bump8), PTR_INC(uint16, p16, bump16)) { + CPTR_INC(uint8_t, p8, bump8), PTR_INC(uint16_t, p16, bump16)) { for (int w = width; w-- != 0; --p8, --p16) { - uint32 pix = *p8; + uint32_t pix = *p8; pix |= pix << 8; - *p16 = static_cast(pix); + *p16 = static_cast(pix); } } } @@ -229,7 +230,7 @@ bool CommonInitDecode(absl::string_view png_string, int desired_channels, CommonFreeDecode(context); return false; } - context->data = absl::bit_cast(png_string.data()); + context->data = absl::bit_cast(png_string.data()); context->data_left = png_string.size(); png_set_read_fn(context->png_ptr, context, StringReader); png_read_info(context->png_ptr, context->info_ptr); @@ -342,9 +343,9 @@ bool CommonFinishDecode(png_bytep data, int row_bytes, DecodeContext* context) { // Synthesize 16 bits from 8 if requested. if (context->need_to_synthesize_16) - Convert8to16(absl::bit_cast(data), context->channels, row_bytes, - context->width, context->height, absl::bit_cast(data), - row_bytes); + Convert8to16(absl::bit_cast(data), context->channels, row_bytes, + context->width, context->height, + absl::bit_cast(data), row_bytes); return ok; } diff --git a/tensorflow/core/lib/png/png_io.h b/tensorflow/core/lib/png/png_io.h index a7fff84c1961ef..71d14546613328 100644 --- a/tensorflow/core/lib/png/png_io.h +++ b/tensorflow/core/lib/png/png_io.h @@ -45,7 +45,7 @@ namespace png { // Handy container for decoding information and struct pointers struct DecodeContext { - const uint8* data; + const uint8_t* data; int data_left; png_structp png_ptr; png_infop info_ptr; diff --git a/tensorflow/core/lib/strings/base64_test.cc b/tensorflow/core/lib/strings/base64_test.cc index df4a4bcf59353c..ce68ee2b4dea72 100644 --- a/tensorflow/core/lib/strings/base64_test.cc +++ b/tensorflow/core/lib/strings/base64_test.cc @@ -20,7 +20,7 @@ limitations under the License. namespace tensorflow { TEST(Base64, EncodeDecode) { - const string original = "a simple test message!"; + const std::string original = "a simple test message!"; tstring encoded; TF_EXPECT_OK(Base64Encode(original, &encoded)); EXPECT_EQ("YSBzaW1wbGUgdGVzdCBtZXNzYWdlIQ", encoded); diff --git a/tensorflow/core/lib/strings/ordered_code.cc b/tensorflow/core/lib/strings/ordered_code.cc index 31b08152f963e2..8e69dbe7fc6809 100644 --- a/tensorflow/core/lib/strings/ordered_code.cc +++ b/tensorflow/core/lib/strings/ordered_code.cc @@ -134,7 +134,7 @@ static const char kFFCharacter = '\000'; // Combined with kEscape2 static const char kEscape1_Separator[2] = {kEscape1, kSeparator}; // Append to "*dest" the "len" bytes starting from "*src". 
-inline static void AppendBytes(string* dest, const char* src, size_t len) { +inline static void AppendBytes(std::string* dest, const char* src, size_t len) { dest->append(src, len); } @@ -164,7 +164,8 @@ const char* OrderedCode::TEST_SkipToNextSpecialByte(const char* start, // Helper routine to encode "s" and append to "*dest", escaping special // characters. -inline static void EncodeStringFragment(string* dest, absl::string_view s) { +inline static void EncodeStringFragment(std::string* dest, + absl::string_view s) { const char* p = s.data(); const char* limit = p + s.size(); const char* copy_start = p; @@ -191,12 +192,12 @@ inline static void EncodeStringFragment(string* dest, absl::string_view s) { } } -void OrderedCode::WriteString(string* dest, absl::string_view s) { +void OrderedCode::WriteString(std::string* dest, absl::string_view s) { EncodeStringFragment(dest, s); AppendBytes(dest, kEscape1_Separator, 2); } -void OrderedCode::WriteNumIncreasing(string* dest, uint64 val) { +void OrderedCode::WriteNumIncreasing(std::string* dest, uint64_t val) { // Values are encoded with a single byte length prefix, followed // by the actual value in big-endian format with leading 0 bytes // dropped. @@ -216,7 +217,8 @@ void OrderedCode::WriteNumIncreasing(string* dest, uint64 val) { // If parse succeeds, return true, consume encoding from // "*src", and if result != NULL append the decoded string to "*result". // Otherwise, return false and leave both undefined. -inline static bool ReadStringInternal(absl::string_view* src, string* result) { +inline static bool ReadStringInternal(absl::string_view* src, + std::string* result) { const char* start = src->data(); const char* string_limit = src->data() + src->size(); @@ -271,11 +273,11 @@ inline static bool ReadStringInternal(absl::string_view* src, string* result) { return false; } -bool OrderedCode::ReadString(absl::string_view* src, string* result) { +bool OrderedCode::ReadString(absl::string_view* src, std::string* result) { return ReadStringInternal(src, result); } -bool OrderedCode::ReadNumIncreasing(absl::string_view* src, uint64* result) { +bool OrderedCode::ReadNumIncreasing(absl::string_view* src, uint64_t* result) { if (src->empty()) { return false; // Not enough bytes } @@ -294,7 +296,7 @@ bool OrderedCode::ReadNumIncreasing(absl::string_view* src, uint64* result) { } if (result) { - uint64 tmp = 0; + uint64_t tmp = 0; for (size_t i = 0; i < len; i++) { tmp <<= 8; tmp |= static_cast((*src)[1 + i]); @@ -305,7 +307,7 @@ bool OrderedCode::ReadNumIncreasing(absl::string_view* src, uint64* result) { return true; } -void OrderedCode::TEST_Corrupt(string* str, int k) { +void OrderedCode::TEST_Corrupt(std::string* str, int k) { int seen_seps = 0; for (size_t i = 0; i + 1 < str->size(); i++) { if ((*str)[i] == kEscape1 && (*str)[i + 1] == kSeparator) { @@ -389,7 +391,7 @@ static const char kLengthToHeaderBits[1 + kMaxSigned64Length][2] = { // This array maps encoding lengths to the header bits that overlap with // the payload and need fixing when reading. -static const uint64 kLengthToMask[1 + kMaxSigned64Length] = { +static const uint64_t kLengthToMask[1 + kMaxSigned64Length] = { 0ULL, 0x80ULL, 0xc000ULL, @@ -408,7 +410,7 @@ static const uint64 kLengthToMask[1 + kMaxSigned64Length] = { // bit position (the highest bit position in a positive int64 is 63). // For a negative number n, we count the bits in ~n. // That is, length = kBitsToLength[tsl::Log2Floor64(n < 0 ? ~n : n) + 1]. 
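// Worked example (editorial aside, not part of the patch): for val == 100,
// tsl::Log2Floor64(100) == 6, so kBitsToLength[6 + 1] == 2 and
// WriteSignedNumIncreasing further down emits a 2-byte encoding; values in
// [0, 64) take its single-byte (length == 1) fast path instead.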
-static const int8 kBitsToLength[1 + 63] = { +static const int8_t kBitsToLength[1 + 63] = { 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10}; @@ -418,23 +420,23 @@ static inline int SignedEncodingLength(int64_t n) { return kBitsToLength[tsl::Log2Floor64(n < 0 ? ~n : n) + 1]; } -static void StoreBigEndian64(char* dst, uint64 v) { +static void StoreBigEndian64(char* dst, uint64_t v) { for (int i = 0; i < 8; i++) { dst[i] = (v >> (56 - 8 * i)) & 0xff; } } -static uint64 LoadBigEndian64(const char* src) { - uint64 result = 0; +static uint64_t LoadBigEndian64(const char* src) { + uint64_t result = 0; for (int i = 0; i < 8; i++) { unsigned char c = static_cast(src[i]); - result |= static_cast(c) << (56 - 8 * i); + result |= static_cast(c) << (56 - 8 * i); } return result; } -void OrderedCode::WriteSignedNumIncreasing(string* dest, int64_t val) { - const uint64 x = val < 0 ? ~val : val; +void OrderedCode::WriteSignedNumIncreasing(std::string* dest, int64_t val) { + const uint64_t x = val < 0 ? ~val : val; if (x < 64) { // fast path for encoding length == 1 *dest += kLengthToHeaderBits[1][0] ^ val; return; @@ -458,12 +460,12 @@ void OrderedCode::WriteSignedNumIncreasing(string* dest, int64_t val) { bool OrderedCode::ReadSignedNumIncreasing(absl::string_view* src, int64_t* result) { if (src->empty()) return false; - const uint64 xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL; + const uint64_t xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL; const unsigned char first_byte = (*src)[0] ^ (xor_mask & 0xff); // now calculate and test length, and set x to raw (unmasked) result int len; - uint64 x; + uint64_t x; if (first_byte != 0xff) { len = 7 - tsl::Log2Floor64(first_byte ^ 0xff); if (src->size() < static_cast(len)) return false; diff --git a/tensorflow/core/lib/strings/ordered_code.h b/tensorflow/core/lib/strings/ordered_code.h index e7485bd57f7e15..2d2811ac5af30f 100644 --- a/tensorflow/core/lib/strings/ordered_code.h +++ b/tensorflow/core/lib/strings/ordered_code.h @@ -54,9 +54,9 @@ class OrderedCode { // Encoding routines: each one of the following routines append // one item to "*dest" in an encoding where larger values are // ordered lexicographically after smaller values. - static void WriteString(string* dest, absl::string_view str); - static void WriteNumIncreasing(string* dest, uint64 num); - static void WriteSignedNumIncreasing(string* dest, int64_t num); + static void WriteString(std::string* dest, absl::string_view str); + static void WriteNumIncreasing(std::string* dest, uint64_t num); + static void WriteSignedNumIncreasing(std::string* dest, int64_t num); // ------------------------------------------------------------------- // Decoding routines: these extract an item earlier encoded using @@ -66,13 +66,13 @@ class OrderedCode { // result. In case of string result, the decoded string is appended to // "*result". Returns true if the next item was read successfully, false // otherwise. - static bool ReadString(absl::string_view* src, string* result); - static bool ReadNumIncreasing(absl::string_view* src, uint64* result); + static bool ReadString(absl::string_view* src, std::string* result); + static bool ReadNumIncreasing(absl::string_view* src, uint64_t* result); static bool ReadSignedNumIncreasing(absl::string_view* src, int64_t* result); // Helper for testing: corrupt "*str" by changing the kth item separator // in the string. 
- static void TEST_Corrupt(string* str, int k); + static void TEST_Corrupt(std::string* str, int k); // Helper for testing. // SkipToNextSpecialByte is an internal routine defined in the .cc file diff --git a/tensorflow/core/lib/strings/ordered_code_test.cc b/tensorflow/core/lib/strings/ordered_code_test.cc index 4717007fc27fc2..9ef3a8dafd2138 100644 --- a/tensorflow/core/lib/strings/ordered_code_test.cc +++ b/tensorflow/core/lib/strings/ordered_code_test.cc @@ -32,8 +32,8 @@ namespace tensorflow { namespace strings { namespace { -string RandomString(random::SimplePhilox* rnd, size_t len) { - string x; +std::string RandomString(random::SimplePhilox* rnd, size_t len) { + std::string x; for (size_t i = 0; i < len; i++) { x += rnd->Uniform(256); } @@ -45,33 +45,34 @@ string RandomString(random::SimplePhilox* rnd, size_t len) { // Read/WriteIncreasing are defined for string, uint64, int64 below. template -void OCWriteIncreasing(string* dest, const T& val); +void OCWriteIncreasing(std::string* dest, const T& val); template bool OCReadIncreasing(absl::string_view* src, T* result); // Read/WriteIncreasing template <> -void OCWriteIncreasing(string* dest, const string& val) { +void OCWriteIncreasing(std::string* dest, const std::string& val) { OrderedCode::WriteString(dest, val); } template <> -bool OCReadIncreasing(absl::string_view* src, string* result) { +bool OCReadIncreasing(absl::string_view* src, + std::string* result) { return OrderedCode::ReadString(src, result); } // Read/WriteIncreasing template <> -void OCWriteIncreasing(string* dest, const uint64& val) { +void OCWriteIncreasing(std::string* dest, const uint64_t& val) { OrderedCode::WriteNumIncreasing(dest, val); } template <> -bool OCReadIncreasing(absl::string_view* src, uint64* result) { +bool OCReadIncreasing(absl::string_view* src, uint64_t* result) { return OrderedCode::ReadNumIncreasing(src, result); } // Read/WriteIncreasing template <> -void OCWriteIncreasing(string* dest, const int64_t& val) { +void OCWriteIncreasing(std::string* dest, const int64_t& val) { OrderedCode::WriteSignedNumIncreasing(dest, val); } template <> @@ -80,14 +81,14 @@ bool OCReadIncreasing(absl::string_view* src, int64_t* result) { } template -string OCWrite(T val) { - string result; +std::string OCWrite(T val) { + std::string result; OCWriteIncreasing(&result, val); return result; } template -void OCWriteToString(string* result, T val) { +void OCWriteToString(std::string* result, T val) { OCWriteIncreasing(result, val); } @@ -100,7 +101,7 @@ bool OCRead(absl::string_view* s, T* val) { // Numbers template -T TestRead(const string& a) { +T TestRead(const std::string& a) { // gracefully reject any proper prefix of an encoding for (int i = 0; i < a.size() - 1; ++i) { absl::string_view s(a.data(), i); @@ -124,9 +125,9 @@ void TestWriteRead(T expected) { // output. 
template void TestWriteAppends(T first, U second) { - string encoded; + std::string encoded; OCWriteToString(&encoded, first); - string encoded_first_only = encoded; + std::string encoded_first_only = encoded; OCWriteToString(&encoded, second); EXPECT_NE(encoded, encoded_first_only); EXPECT_TRUE(absl::StartsWith(encoded, encoded_first_only)); @@ -149,7 +150,7 @@ void TestNumbers(T multiplier) { random::SimplePhilox rnd(&philox); for (int bits = 1; bits <= std::numeric_limits().digits; ++bits) { // test random non-negative numbers with given number of significant bits - const uint64 mask = (~0ULL) >> (64 - bits); + const uint64_t mask = (~0ULL) >> (64 - bits); for (int i = 0; i < 1000; i++) { T x = rnd.Rand64() & mask; TestWriteRead(multiplier * x); @@ -160,16 +161,18 @@ void TestNumbers(T multiplier) { } // Return true iff 'a' is "before" 'b' -bool CompareStrings(const string& a, const string& b) { return (a < b); } +bool CompareStrings(const std::string& a, const std::string& b) { + return (a < b); +} template void TestNumberOrdering() { // first the negative numbers (if T is signed, otherwise no-op) - string laststr = OCWrite(std::numeric_limits().min()); + std::string laststr = OCWrite(std::numeric_limits().min()); for (T num = std::numeric_limits().min() / 2; num != 0; num /= 2) { - string strminus1 = OCWrite(num - 1); - string str = OCWrite(num); - string strplus1 = OCWrite(num + 1); + std::string strminus1 = OCWrite(num - 1); + std::string str = OCWrite(num); + std::string strplus1 = OCWrite(num + 1); CHECK(CompareStrings(strminus1, str)); CHECK(CompareStrings(str, strplus1)); @@ -185,9 +188,9 @@ void TestNumberOrdering() { T num = 1; while (num < std::numeric_limits().max() / 2) { num *= 2; - string strminus1 = OCWrite(num - 1); - string str = OCWrite(num); - string strplus1 = OCWrite(num + 1); + std::string strminus1 = OCWrite(num - 1); + std::string str = OCWrite(num); + std::string strplus1 = OCWrite(num + 1); CHECK(CompareStrings(strminus1, str)); CHECK(CompareStrings(str, strplus1)); @@ -199,7 +202,7 @@ void TestNumberOrdering() { } // Helper routine for testing TEST_SkipToNextSpecialByte -size_t FindSpecial(const string& x) { +size_t FindSpecial(const std::string& x) { const char* p = x.data(); const char* limit = p + x.size(); const char* result = OrderedCode::TEST_SkipToNextSpecialByte(p, limit); @@ -209,15 +212,15 @@ size_t FindSpecial(const string& x) { // Helper function template to create strings from string literals (excluding // the terminal zero byte of the underlying character array). template -string ByteSequence(const char (&arr)[N]) { - return string(arr, N - 1); +std::string ByteSequence(const char (&arr)[N]) { + return std::string(arr, N - 1); } TEST(OrderedCode, SkipToNextSpecialByte) { for (size_t len = 0; len < 256; len++) { random::PhiloxRandom philox(301, 17); random::SimplePhilox rnd(&philox); - string x; + std::string x; while (x.size() < len) { char c = 1 + rnd.Uniform(254); ASSERT_NE(c, 0); @@ -228,7 +231,7 @@ TEST(OrderedCode, SkipToNextSpecialByte) { for (size_t special_pos = 0; special_pos < len; special_pos++) { for (size_t special_test = 0; special_test < 2; special_test++) { const char special_byte = (special_test == 0) ? 
0 : 255; - string y = x; + std::string y = x; y[special_pos] = special_byte; EXPECT_EQ(FindSpecial(y), special_pos); if (special_pos < 16) { @@ -283,9 +286,9 @@ TEST(OrderedCode, ExhaustiveFindSpecial) { EXPECT_EQ(count, 256 * 256 * 256 * 2); } -TEST(Uint64, EncodeDecode) { TestNumbers(1); } +TEST(Uint64, EncodeDecode) { TestNumbers(1); } -TEST(Uint64, Ordering) { TestNumberOrdering(); } +TEST(Uint64, Ordering) { TestNumberOrdering(); } TEST(Int64, EncodeDecode) { TestNumbers(1); @@ -295,15 +298,15 @@ TEST(Int64, EncodeDecode) { TEST(Int64, Ordering) { TestNumberOrdering(); } // Returns the bitwise complement of s. -inline string StrNot(const string& s) { - string result; - for (string::const_iterator it = s.begin(); it != s.end(); ++it) +inline std::string StrNot(const std::string& s) { + std::string result; + for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) result.push_back(~*it); return result; } template -void TestInvalidEncoding(const string& s) { +void TestInvalidEncoding(const std::string& s) { absl::string_view p(s); EXPECT_FALSE(OCRead(&p, nullptr)); EXPECT_EQ(s, p); @@ -311,11 +314,11 @@ void TestInvalidEncoding(const string& s) { TEST(OrderedCodeInvalidEncodingsTest, Overflow) { // 1U << 64, increasing and decreasing - const string k2xx64U = "\x09\x01" + string(8, 0); - TestInvalidEncoding(k2xx64U); + const std::string k2xx64U = "\x09\x01" + std::string(8, 0); + TestInvalidEncoding(k2xx64U); // 1 << 63 and ~(1 << 63), increasing and decreasing - const string k2xx63 = "\xff\xc0\x80" + string(7, 0); + const std::string k2xx63 = "\xff\xc0\x80" + std::string(7, 0); TestInvalidEncoding(k2xx63); TestInvalidEncoding(StrNot(k2xx63)); } @@ -332,11 +335,11 @@ TEST(OrderedCodeInvalidEncodingsDeathTest, NonCanonical) { for (int n = 2; n <= 9; ++n) { // The zero in non_minimal[1] is "redundant". - string non_minimal = - string(1, n - 1) + string(1, 0) + RandomString(&rnd, n - 2); + std::string non_minimal = + std::string(1, n - 1) + std::string(1, 0) + RandomString(&rnd, n - 2); EXPECT_EQ(n, non_minimal.length()); - EXPECT_NE(OCWrite(0), non_minimal); + EXPECT_NE(OCWrite(0), non_minimal); #ifndef NDEBUG absl::string_view s(non_minimal); EXPECT_DEATH(OrderedCode::ReadNumIncreasing(&s, nullptr), @@ -348,11 +351,12 @@ TEST(OrderedCodeInvalidEncodingsDeathTest, NonCanonical) { for (int n = 2; n <= 10; ++n) { // Header with 1 sign bit and n-1 size bits. - string header = string(n / 8, 0xff) + string(1, 0xff << (8 - (n % 8))); + std::string header = + std::string(n / 8, 0xff) + std::string(1, 0xff << (8 - (n % 8))); // There are more than 7 zero bits between header bits and "payload". - string non_minimal = header + - string(1, rnd.Uniform(256) & ~*header.rbegin()) + - RandomString(&rnd, n - header.length() - 1); + std::string non_minimal = + header + std::string(1, rnd.Uniform(256) & ~*header.rbegin()) + + RandomString(&rnd, n - header.length() - 1); EXPECT_EQ(n, non_minimal.length()); EXPECT_NE(OCWrite(0), non_minimal); @@ -369,7 +373,7 @@ TEST(OrderedCodeInvalidEncodingsDeathTest, NonCanonical) { // Returns random number with specified number of bits, // i.e., in the range [2^(bits-1),2^bits). -uint64 NextBits(random::SimplePhilox* rnd, int bits) { +uint64_t NextBits(random::SimplePhilox* rnd, int bits) { return (bits != 0) ? 
(rnd->Rand64() % (1LL << (bits - 1))) + (1LL << (bits - 1)) : 0; @@ -385,7 +389,7 @@ void BM_WriteNum(::testing::benchmark::State& state, T multiplier) { for (int i = 0; i < kValues; i++) { values[i] = NextBits(&rnd, state.max_iterations % 64) * multiplier; } - string result; + std::string result; int index = 0; for (auto i : state) { result.clear(); @@ -400,12 +404,12 @@ void BM_ReadNum(::testing::benchmark::State& state, T multiplier) { random::SimplePhilox rnd(&philox); // Use enough distinct values to confuse the branch predictor constexpr int kValues = 64; - string values[kValues]; + std::string values[kValues]; for (int i = 0; i < kValues; i++) { T val = NextBits(&rnd, i % 64) * multiplier; values[i] = OCWrite(val); } - uint32 index = 0; + uint32_t index = 0; for (auto i : state) { T val; absl::string_view s = values[index++ % kValues]; @@ -423,7 +427,7 @@ void BM_ReadNum(::testing::benchmark::State& state, T multiplier) { } \ BENCHMARK(BM_Read##name) -BENCHMARK_NUM(NumIncreasing, uint64, 1); +BENCHMARK_NUM(NumIncreasing, uint64_t, 1); BENCHMARK_NUM(SignedNum, int64_t, 1); BENCHMARK_NUM(SignedNumNegative, int64_t, -1); @@ -437,30 +441,30 @@ TEST(String, EncodeDecode) { random::SimplePhilox rnd(&philox); for (int len = 0; len < 256; len++) { - const string a = RandomString(&rnd, len); + const std::string a = RandomString(&rnd, len); TestWriteRead(a); for (int len2 = 0; len2 < 64; len2++) { - const string b = RandomString(&rnd, len2); + const std::string b = RandomString(&rnd, len2); TestWriteAppends(a, b); - string out; - OCWriteToString(&out, a); - OCWriteToString(&out, b); + std::string out; + OCWriteToString(&out, a); + OCWriteToString(&out, b); - string a2, b2, dummy; + std::string a2, b2, dummy; absl::string_view s = out; absl::string_view s2 = out; - CHECK(OCRead(&s, &a2)); - CHECK(OCRead(&s2, nullptr)); + CHECK(OCRead(&s, &a2)); + CHECK(OCRead(&s2, nullptr)); CHECK_EQ(s, s2); - CHECK(OCRead(&s, &b2)); - CHECK(OCRead(&s2, nullptr)); + CHECK(OCRead(&s, &b2)); + CHECK(OCRead(&s2, nullptr)); CHECK_EQ(s, s2); - CHECK(!OCRead(&s, &dummy)); - CHECK(!OCRead(&s2, nullptr)); + CHECK(!OCRead(&s, &dummy)); + CHECK(!OCRead(&s2, nullptr)); CHECK_EQ(a, a2); CHECK_EQ(b, b2); CHECK(s.empty()); @@ -472,8 +476,8 @@ TEST(String, EncodeDecode) { // 'str' is a string literal that may contain '\0'. 
#define STATIC_STR(str) StringPiece((str), sizeof(str) - 1) -string EncodeStringIncreasing(absl::string_view value) { - string encoded; +std::string EncodeStringIncreasing(absl::string_view value) { + std::string encoded; OrderedCode::WriteString(&encoded, value); return encoded; } @@ -515,19 +519,20 @@ TEST(String, Increasing) { } TEST(EncodingIsExpected, String) { - std::vector> data = { - {"", string("\x00\x01", 2)}, - {"foo", string("foo\x00\x01", 5)}, - {"hello", string("hello\x00\x01", 7)}, - {string("\x00\x01\xff", 3), string("\x00\xff\x01\xff\x00\x00\x01", 7)}, + std::vector> data = { + {"", std::string("\x00\x01", 2)}, + {"foo", std::string("foo\x00\x01", 5)}, + {"hello", std::string("hello\x00\x01", 7)}, + {std::string("\x00\x01\xff", 3), + std::string("\x00\xff\x01\xff\x00\x00\x01", 7)}, }; for (const auto& t : data) { - string result; + std::string result; OrderedCode::WriteString(&result, t.first); EXPECT_EQ(t.second, result); absl::string_view in = result; - string decoded; + std::string decoded; EXPECT_TRUE(OrderedCode::ReadString(&in, &decoded)); EXPECT_EQ(t.first, decoded); EXPECT_EQ("", in); @@ -535,7 +540,7 @@ TEST(EncodingIsExpected, String) { } TEST(EncodingIsExpected, Unsigned) { - std::vector> data = { + std::vector> data = { {0x0ull, ByteSequence("\000")}, {0x1ull, ByteSequence("\001\001")}, {0x2ull, ByteSequence("\001\002")}, @@ -753,13 +758,13 @@ TEST(EncodingIsExpected, Unsigned) { ByteSequence("\010\200\000\000\000\000\000\000\001")}, }; for (const auto& t : data) { - uint64 num = t.first; - string result; + uint64_t num = t.first; + std::string result; OrderedCode::WriteNumIncreasing(&result, num); EXPECT_EQ(t.second, result) << std::hex << num; absl::string_view in = result; - uint64 decoded; + uint64_t decoded; EXPECT_TRUE(OrderedCode::ReadNumIncreasing(&in, &decoded)); EXPECT_EQ(num, decoded); EXPECT_EQ("", in); @@ -767,7 +772,7 @@ TEST(EncodingIsExpected, Unsigned) { } TEST(EncodingIsExpected, Signed) { - std::vector> data = { + std::vector> data = { {0ll, ByteSequence("\200")}, {1ll, ByteSequence("\201")}, {2ll, ByteSequence("\202")}, @@ -1201,7 +1206,7 @@ TEST(EncodingIsExpected, Signed) { }; for (const auto& t : data) { int64_t num = t.first; - string result; + std::string result; OrderedCode::WriteSignedNumIncreasing(&result, num); EXPECT_EQ(t.second, result) << std::hex << num; @@ -1216,15 +1221,15 @@ TEST(EncodingIsExpected, Signed) { void BM_WriteString(::testing::benchmark::State& state, int len) { random::PhiloxRandom philox(301, 17); random::SimplePhilox rnd(&philox); - string x; + std::string x; for (int i = 0; i < len; i++) { x += rnd.Uniform(256); } - string y; + std::string y; for (auto s : state) { y.clear(); - OCWriteToString(&y, x); + OCWriteToString(&y, x); } state.SetBytesProcessed(state.iterations() * len); } @@ -1232,18 +1237,18 @@ void BM_WriteString(::testing::benchmark::State& state, int len) { void BM_ReadString(::testing::benchmark::State& state, int len) { random::PhiloxRandom philox(301, 17); random::SimplePhilox rnd(&philox); - string x; + std::string x; for (int i = 0; i < len; i++) { x += rnd.Uniform(256); } - string data; - OCWriteToString(&data, x); - string result; + std::string data; + OCWriteToString(&data, x); + std::string result; for (auto i : state) { result.clear(); absl::string_view s = data; - OCRead(&s, &result); + OCRead(&s, &result); } state.SetBytesProcessed(state.iterations() * len); } diff --git a/tensorflow/core/lib/strings/proto_serialization_test.cc b/tensorflow/core/lib/strings/proto_serialization_test.cc 
index 052a6dff016d25..fa2e2364d9f216 100644 --- a/tensorflow/core/lib/strings/proto_serialization_test.cc +++ b/tensorflow/core/lib/strings/proto_serialization_test.cc @@ -47,7 +47,7 @@ static void BM_ProtoSerializationToString(::testing::benchmark::State& state) { GraphDef graph_def = MakeGraphDef(num_nodes); for (auto i : state) { - string serialized; + std::string serialized; testing::DoNotOptimize( SerializeToStringDeterministic(graph_def, &serialized)); } diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc index 41db93ae910a18..5edbfd28bdccbf 100644 --- a/tensorflow/core/lib/wav/wav_io.cc +++ b/tensorflow/core/lib/wav/wav_io.cc @@ -73,7 +73,7 @@ constexpr char kRiffType[] = "WAVE"; constexpr char kFormatChunkId[] = "fmt "; constexpr char kDataChunkId[] = "data"; -inline int16 FloatToInt16Sample(float data) { +inline int16_t FloatToInt16Sample(float data) { constexpr float kMultiplier = 1.0f * (1 << 15); return std::min(std::max(roundf(data * kMultiplier), std::numeric_limits::min()), @@ -212,7 +212,7 @@ absl::Status EncodeAudioAsS16LEWav(const float* audio, size_t sample_rate, for (size_t i = 0; i < num_samples; ++i) { int16_t sample = FloatToInt16Sample(audio[i]); core::EncodeFixed16(&data[i * kBytesPerSample], - static_cast(sample)); + static_cast(sample)); } return absl::OkStatus(); } @@ -230,13 +230,14 @@ template Status EncodeAudioAsS16LEWav(const float* audio, absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string, std::vector* float_values, - uint32* sample_count, - uint16* channel_count, - uint32* sample_rate) { + uint32_t* sample_count, + uint16_t* channel_count, + uint32_t* sample_rate) { int offset = 0; TF_RETURN_IF_ERROR(ExpectText(wav_string, kRiffChunkId, &offset)); - uint32 total_file_size; - TF_RETURN_IF_ERROR(ReadValue(wav_string, &total_file_size, &offset)); + uint32_t total_file_size; + TF_RETURN_IF_ERROR( + ReadValue(wav_string, &total_file_size, &offset)); TF_RETURN_IF_ERROR(ExpectText(wav_string, kRiffType, &offset)); std::string found_text; TF_RETURN_IF_ERROR(ReadString(wav_string, 4, &found_text, &offset)); @@ -252,57 +253,61 @@ absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string, found_text != "link" && found_text != "axml") { return errors::InvalidArgument("Unexpected field ", found_text); } - uint32 size_of_chunk; - TF_RETURN_IF_ERROR(ReadValue(wav_string, &size_of_chunk, &offset)); + uint32_t size_of_chunk; + TF_RETURN_IF_ERROR( + ReadValue(wav_string, &size_of_chunk, &offset)); TF_RETURN_IF_ERROR( IncrementOffset(offset, size_of_chunk, wav_string.size(), &offset)); TF_RETURN_IF_ERROR(ReadString(wav_string, 4, &found_text, &offset)); } - uint32 format_chunk_size; + uint32_t format_chunk_size; TF_RETURN_IF_ERROR( - ReadValue(wav_string, &format_chunk_size, &offset)); + ReadValue(wav_string, &format_chunk_size, &offset)); if ((format_chunk_size != 16) && (format_chunk_size != 18)) { return errors::InvalidArgument( "Bad format chunk size for WAV: Expected 16 or 18, but got", format_chunk_size); } - uint16 audio_format; - TF_RETURN_IF_ERROR(ReadValue(wav_string, &audio_format, &offset)); + uint16_t audio_format; + TF_RETURN_IF_ERROR(ReadValue(wav_string, &audio_format, &offset)); if (audio_format != 1) { return errors::InvalidArgument( "Bad audio format for WAV: Expected 1 (PCM), but got", audio_format); } - TF_RETURN_IF_ERROR(ReadValue(wav_string, channel_count, &offset)); + TF_RETURN_IF_ERROR(ReadValue(wav_string, channel_count, &offset)); if (*channel_count < 1) { return 
errors::InvalidArgument( "Bad number of channels for WAV: Expected at least 1, but got ", *channel_count); } - TF_RETURN_IF_ERROR(ReadValue(wav_string, sample_rate, &offset)); - uint32 bytes_per_second; - TF_RETURN_IF_ERROR(ReadValue(wav_string, &bytes_per_second, &offset)); - uint16 bytes_per_sample; - TF_RETURN_IF_ERROR(ReadValue(wav_string, &bytes_per_sample, &offset)); + TF_RETURN_IF_ERROR(ReadValue(wav_string, sample_rate, &offset)); + uint32_t bytes_per_second; + TF_RETURN_IF_ERROR( + ReadValue(wav_string, &bytes_per_second, &offset)); + uint16_t bytes_per_sample; + TF_RETURN_IF_ERROR( + ReadValue(wav_string, &bytes_per_sample, &offset)); // Confusingly, bits per sample is defined as holding the number of bits for // one channel, unlike the definition of sample used elsewhere in the WAV // spec. For example, bytes per sample is the memory needed for all channels // for one point in time. - uint16 bits_per_sample; - TF_RETURN_IF_ERROR(ReadValue(wav_string, &bits_per_sample, &offset)); + uint16_t bits_per_sample; + TF_RETURN_IF_ERROR( + ReadValue(wav_string, &bits_per_sample, &offset)); if (bits_per_sample != 16) { return errors::InvalidArgument( "Can only read 16-bit WAV files, but received ", bits_per_sample); } - const uint32 expected_bytes_per_sample = + const uint32_t expected_bytes_per_sample = ((bits_per_sample * *channel_count) + 7) / 8; if (bytes_per_sample != expected_bytes_per_sample) { return errors::InvalidArgument( "Bad bytes per sample in WAV header: Expected ", expected_bytes_per_sample, " but got ", bytes_per_sample); } - const uint64 expected_bytes_per_second = - static_cast(bytes_per_sample) * *sample_rate; - if (static_cast(bytes_per_second) != expected_bytes_per_second) { + const uint64_t expected_bytes_per_second = + static_cast(bytes_per_sample) * *sample_rate; + if (static_cast(bytes_per_second) != expected_bytes_per_second) { return errors::InvalidArgument( "Bad bytes per second in WAV header: Expected ", expected_bytes_per_second, " but got ", bytes_per_second, @@ -318,12 +323,12 @@ absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string, while (offset < wav_string.size()) { std::string chunk_id; TF_RETURN_IF_ERROR(ReadString(wav_string, 4, &chunk_id, &offset)); - uint32 chunk_size; - TF_RETURN_IF_ERROR(ReadValue(wav_string, &chunk_size, &offset)); - if (chunk_size > std::numeric_limits::max()) { + uint32_t chunk_size; + TF_RETURN_IF_ERROR(ReadValue(wav_string, &chunk_size, &offset)); + if (chunk_size > std::numeric_limits::max()) { return errors::InvalidArgument( "WAV data chunk '", chunk_id, "' is too large: ", chunk_size, - " bytes, but the limit is ", std::numeric_limits::max()); + " bytes, but the limit is ", std::numeric_limits::max()); } if (chunk_id == kDataChunkId) { if (was_data_found) { @@ -331,18 +336,18 @@ absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string, } was_data_found = true; *sample_count = chunk_size / bytes_per_sample; - const uint32 data_count = *sample_count * *channel_count; + const uint32_t data_count = *sample_count * *channel_count; int unused_new_offset = 0; // Validate that the data exists before allocating space for it // (prevent easy OOM errors). 
- TF_RETURN_IF_ERROR(IncrementOffset(offset, sizeof(int16) * data_count, + TF_RETURN_IF_ERROR(IncrementOffset(offset, sizeof(int16_t) * data_count, wav_string.size(), &unused_new_offset)); float_values->resize(data_count); for (int i = 0; i < data_count; ++i) { int16_t single_channel_value = 0; TF_RETURN_IF_ERROR( - ReadValue(wav_string, &single_channel_value, &offset)); + ReadValue(wav_string, &single_channel_value, &offset)); (*float_values)[i] = Int16SampleToFloat(single_channel_value); } } else { diff --git a/tensorflow/core/lib/wav/wav_io.h b/tensorflow/core/lib/wav/wav_io.h index 99a3df5038e68b..4ffe789dd282d7 100644 --- a/tensorflow/core/lib/wav/wav_io.h +++ b/tensorflow/core/lib/wav/wav_io.h @@ -65,9 +65,9 @@ extern template Status EncodeAudioAsS16LEWav(const float* audio, // The results are output as floats within the range -1 to 1, absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string, std::vector* float_values, - uint32* sample_count, - uint16* channel_count, - uint32* sample_rate); + uint32_t* sample_count, + uint16_t* channel_count, + uint32_t* sample_rate); // Everything below here is only exposed publicly for testing purposes. @@ -88,8 +88,8 @@ absl::Status ReadValue(const std::string& data, T* value, int* offset) { memcpy(value, data.data() + *offset, sizeof(T)); } else { *value = 0; - const uint8* data_buf = - reinterpret_cast(data.data() + *offset); + const uint8_t* data_buf = + reinterpret_cast(data.data() + *offset); int shift = 0; for (int i = 0; i < sizeof(T); ++i, shift += 8) { *value = *value | (data_buf[i] << shift); diff --git a/tensorflow/core/lib/wav/wav_io_test.cc b/tensorflow/core/lib/wav/wav_io_test.cc index dfc75257cc85f5..68d0c0fa35fbe7 100644 --- a/tensorflow/core/lib/wav/wav_io_test.cc +++ b/tensorflow/core/lib/wav/wav_io_test.cc @@ -31,10 +31,10 @@ namespace wav { // These are defined in wav_io.cc, and the signatures are here so we don't have // to expose them in the public header. 
-absl::Status ExpectText(const string& data, const string& expected_text, - int* offset); -absl::Status ReadString(const string& data, int expected_length, string* value, - int* offset); +absl::Status ExpectText(const std::string& data, + const std::string& expected_text, int* offset); +absl::Status ReadString(const std::string& data, int expected_length, + std::string* value, int* offset); TEST(WavIO, BadArguments) { float audio[] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f}; @@ -76,7 +76,7 @@ TEST(WavIO, BadArguments) { TEST(WavIO, BasicEven) { float audio[] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f}; - string result; + std::string result; TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 44100, 2, 3, &result)); EXPECT_EQ(56, result.size()); TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 22050, 1, 6, &result)); @@ -87,19 +87,19 @@ TEST(WavIO, BasicEven) { TEST(WavIO, BasicOdd) { float audio[] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f}; - string result; + std::string result; TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 22050, 1, 5, &result)); EXPECT_EQ(54, result.size()); } TEST(WavIO, EncodeThenDecode) { float audio[] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f}; - string wav_data; + std::string wav_data; TF_ASSERT_OK(EncodeAudioAsS16LEWav(audio, 44100, 2, 3, &wav_data)); std::vector decoded_audio; - uint32 decoded_sample_count; - uint16 decoded_channel_count; - uint32 decoded_sample_rate; + uint32_t decoded_sample_count; + uint16_t decoded_channel_count; + uint32_t decoded_sample_rate; TF_ASSERT_OK(DecodeLin16WaveAsFloatVector( wav_data, &decoded_audio, &decoded_sample_count, &decoded_channel_count, &decoded_sample_rate)); @@ -112,59 +112,129 @@ TEST(WavIO, EncodeThenDecode) { } TEST(WavIO, BasicMono) { - std::vector wav_data = { - 'R', 'I', 'F', 'F', // ChunkID - 44, 0, 0, 0, // ChunkSize: 36 + SubChunk2Size - 'W', 'A', 'V', 'E', // Format - 'f', 'm', 't', ' ', // Subchunk1ID - 16, 0, 0, 0, // Subchunk1Size - 1, 0, // AudioFormat: 1=PCM - 1, 0, // NumChannels - 0x44, 0xac, 0, 0, // SampleRate: 44100 - 0x88, 0x58, 0x1, 0, // BytesPerSecond: SampleRate * NumChannels * - // BitsPerSample/8 - 2, 0, // BytesPerSample: NumChannels * BitsPerSample/8 - 16, 0, // BitsPerSample - 'd', 'a', 't', 'a', // Subchunk2ID - 8, 0, 0, 0, // Subchunk2Size: NumSamples * NumChannels * - // BitsPerSample/8 - 0, 0, // Sample 1: 0 - 0xff, 0x7f, // Sample 2: 32767 (saturated) - 0, 0, // Sample 3: 0 - 0x00, 0x80, // Sample 4: -32768 (saturated) + std::vector wav_data = { + 'R', + 'I', + 'F', + 'F', // ChunkID + 44, + 0, + 0, + 0, // ChunkSize: 36 + SubChunk2Size + 'W', + 'A', + 'V', + 'E', // Format + 'f', + 'm', + 't', + ' ', // Subchunk1ID + 16, + 0, + 0, + 0, // Subchunk1Size + 1, + 0, // AudioFormat: 1=PCM + 1, + 0, // NumChannels + 0x44, + 0xac, + 0, + 0, // SampleRate: 44100 + 0x88, + 0x58, + 0x1, + 0, // BytesPerSecond: SampleRate * NumChannels * + // BitsPerSample/8 + 2, + 0, // BytesPerSample: NumChannels * BitsPerSample/8 + 16, + 0, // BitsPerSample + 'd', + 'a', + 't', + 'a', // Subchunk2ID + 8, + 0, + 0, + 0, // Subchunk2Size: NumSamples * NumChannels * + // BitsPerSample/8 + 0, + 0, // Sample 1: 0 + 0xff, + 0x7f, // Sample 2: 32767 (saturated) + 0, + 0, // Sample 3: 0 + 0x00, + 0x80, // Sample 4: -32768 (saturated) }; - string expected(wav_data.begin(), wav_data.end()); + std::string expected(wav_data.begin(), wav_data.end()); float audio[] = {0.0f, 1.0f, 0.0f, -1.0f}; - string result; + std::string result; TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 44100, 1, 4, &result)); EXPECT_EQ(expected, result); } TEST(WavIO, BasicStereo) { - 
std::vector wav_data = { - 'R', 'I', 'F', 'F', // ChunkID - 44, 0, 0, 0, // ChunkSize: 36 + SubChunk2Size - 'W', 'A', 'V', 'E', // Format - 'f', 'm', 't', ' ', // Subchunk1ID - 16, 0, 0, 0, // Subchunk1Size - 1, 0, // AudioFormat: 1=PCM - 2, 0, // NumChannels - 0x44, 0xac, 0, 0, // SampleRate: 44100 - 0x10, 0xb1, 0x2, 0, // BytesPerSecond: SampleRate * NumChannels * - // BitsPerSample/8 - 4, 0, // BytesPerSample: NumChannels * BitsPerSample/8 - 16, 0, // BitsPerSample - 'd', 'a', 't', 'a', // Subchunk2ID - 8, 0, 0, 0, // Subchunk2Size: NumSamples * NumChannels * - // BitsPerSample/8 - 0, 0, // Sample 1: 0 - 0xff, 0x7f, // Sample 2: 32767 (saturated) - 0, 0, // Sample 3: 0 - 0x00, 0x80, // Sample 4: -32768 (saturated) + std::vector wav_data = { + 'R', + 'I', + 'F', + 'F', // ChunkID + 44, + 0, + 0, + 0, // ChunkSize: 36 + SubChunk2Size + 'W', + 'A', + 'V', + 'E', // Format + 'f', + 'm', + 't', + ' ', // Subchunk1ID + 16, + 0, + 0, + 0, // Subchunk1Size + 1, + 0, // AudioFormat: 1=PCM + 2, + 0, // NumChannels + 0x44, + 0xac, + 0, + 0, // SampleRate: 44100 + 0x10, + 0xb1, + 0x2, + 0, // BytesPerSecond: SampleRate * NumChannels * + // BitsPerSample/8 + 4, + 0, // BytesPerSample: NumChannels * BitsPerSample/8 + 16, + 0, // BitsPerSample + 'd', + 'a', + 't', + 'a', // Subchunk2ID + 8, + 0, + 0, + 0, // Subchunk2Size: NumSamples * NumChannels * + // BitsPerSample/8 + 0, + 0, // Sample 1: 0 + 0xff, + 0x7f, // Sample 2: 32767 (saturated) + 0, + 0, // Sample 3: 0 + 0x00, + 0x80, // Sample 4: -32768 (saturated) }; - string expected(wav_data.begin(), wav_data.end()); + std::string expected(wav_data.begin(), wav_data.end()); float audio[] = {0.0f, 1.0f, 0.0f, -1.0f}; - string result; + std::string result; TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 44100, 2, 2, &result)); EXPECT_EQ(expected, result); } @@ -175,38 +245,83 @@ TEST(WavIO, BasicStereo) { // large WAV files are not common, and are unsupported by many readers. // See b/72655902. TEST(WavIO, ChunkSizeOverflow) { - std::vector wav_data = { - 'R', 'I', 'F', 'F', // ChunkID - 60, 0, 0, 0, // ChunkSize: 36 + SubChunk2Size - 'W', 'A', 'V', 'E', // Format - 'f', 'm', 't', ' ', // Subchunk1ID - 16, 0, 0, 0, // Subchunk1Size - 1, 0, // AudioFormat: 1=PCM - 1, 0, // NumChannels - 0x44, 0xac, 0, 0, // SampleRate: 44100 - 0x88, 0x58, 0x1, 0, // BytesPerSecond: SampleRate * NumChannels * - // BitsPerSample/8 - 2, 0, // BytesPerSample: NumChannels * BitsPerSample/8 - 16, 0, // BitsPerSample - 'd', 'a', 't', 'a', // Subchunk2ID - 8, 0, 0, 0, // Subchunk2Size: NumSamples * NumChannels * - // BitsPerSample/8 - 0, 0, // Sample 1: 0 - 0xff, 0x7f, // Sample 2: 32767 (saturated) - 0, 0, // Sample 3: 0 - 0x00, 0x80, // Sample 4: -32768 (saturated) - 'f', 'o', 'o', 'o', // Subchunk2ID - 0xff, 0xff, 0xff, 0xf8, // Chunk size that could cause an infinite loop. 
- 0, 0, // Sample 1: 0 - 0xff, 0x7f, // Sample 2: 32767 (saturated) - 0, 0, // Sample 3: 0 - 0x00, 0x80, // Sample 4: -32768 (saturated) + std::vector wav_data = { + 'R', + 'I', + 'F', + 'F', // ChunkID + 60, + 0, + 0, + 0, // ChunkSize: 36 + SubChunk2Size + 'W', + 'A', + 'V', + 'E', // Format + 'f', + 'm', + 't', + ' ', // Subchunk1ID + 16, + 0, + 0, + 0, // Subchunk1Size + 1, + 0, // AudioFormat: 1=PCM + 1, + 0, // NumChannels + 0x44, + 0xac, + 0, + 0, // SampleRate: 44100 + 0x88, + 0x58, + 0x1, + 0, // BytesPerSecond: SampleRate * NumChannels * + // BitsPerSample/8 + 2, + 0, // BytesPerSample: NumChannels * BitsPerSample/8 + 16, + 0, // BitsPerSample + 'd', + 'a', + 't', + 'a', // Subchunk2ID + 8, + 0, + 0, + 0, // Subchunk2Size: NumSamples * NumChannels * + // BitsPerSample/8 + 0, + 0, // Sample 1: 0 + 0xff, + 0x7f, // Sample 2: 32767 (saturated) + 0, + 0, // Sample 3: 0 + 0x00, + 0x80, // Sample 4: -32768 (saturated) + 'f', + 'o', + 'o', + 'o', // Subchunk2ID + 0xff, + 0xff, + 0xff, + 0xf8, // Chunk size that could cause an infinite loop. + 0, + 0, // Sample 1: 0 + 0xff, + 0x7f, // Sample 2: 32767 (saturated) + 0, + 0, // Sample 3: 0 + 0x00, + 0x80, // Sample 4: -32768 (saturated) }; - string wav_data_string(wav_data.begin(), wav_data.end()); + std::string wav_data_string(wav_data.begin(), wav_data.end()); std::vector decoded_audio; - uint32 decoded_sample_count; - uint16 decoded_channel_count; - uint32 decoded_sample_rate; + uint32_t decoded_sample_count; + uint16_t decoded_channel_count; + uint32_t decoded_sample_rate; absl::Status decode_status = DecodeLin16WaveAsFloatVector( wav_data_string, &decoded_audio, &decoded_sample_count, &decoded_channel_count, &decoded_sample_rate); @@ -244,10 +359,10 @@ TEST(WavIO, IncrementOffset) { } TEST(WavIO, ExpectText) { - std::vector test_data = { + std::vector test_data = { 'E', 'x', 'p', 'e', 'c', 't', 'e', 'd', }; - string test_string(test_data.begin(), test_data.end()); + std::string test_string(test_data.begin(), test_data.end()); int offset = 0; TF_EXPECT_OK(ExpectText(test_string, "Expected", &offset)); @@ -267,13 +382,13 @@ TEST(WavIO, ExpectText) { } TEST(WavIO, ReadString) { - std::vector test_data = { + std::vector test_data = { 'E', 'x', 'p', 'e', 'c', 't', 'e', 'd', }; - string test_string(test_data.begin(), test_data.end()); + std::string test_string(test_data.begin(), test_data.end()); int offset = 0; - string read_value; + std::string read_value; TF_EXPECT_OK(ReadString(test_string, 2, &read_value, &offset)); EXPECT_EQ("Ex", read_value); EXPECT_EQ(2, offset); @@ -287,8 +402,8 @@ TEST(WavIO, ReadString) { } TEST(WavIO, ReadValueInt8) { - std::vector test_data = {0x00, 0x05, 0xff, 0x80}; - string test_string(test_data.begin(), test_data.end()); + std::vector test_data = {0x00, 0x05, 0xff, 0x80}; + std::string test_string(test_data.begin(), test_data.end()); int offset = 0; int8_t read_value; @@ -313,11 +428,11 @@ TEST(WavIO, ReadValueInt8) { } TEST(WavIO, ReadValueUInt8) { - std::vector test_data = {0x00, 0x05, 0xff, 0x80}; - string test_string(test_data.begin(), test_data.end()); + std::vector test_data = {0x00, 0x05, 0xff, 0x80}; + std::string test_string(test_data.begin(), test_data.end()); int offset = 0; - uint8 read_value; + uint8_t read_value; TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset)); EXPECT_EQ(0, read_value); EXPECT_EQ(1, offset); @@ -339,14 +454,14 @@ TEST(WavIO, ReadValueUInt8) { } TEST(WavIO, ReadValueInt16) { - std::vector test_data = { + std::vector test_data = { 0x00, 0x00, // 0 0xff, 0x00, // 255 
0x00, 0x01, // 256 0xff, 0xff, // -1 0x00, 0x80, // -32768 }; - string test_string(test_data.begin(), test_data.end()); + std::string test_string(test_data.begin(), test_data.end()); int offset = 0; int16_t read_value; @@ -375,17 +490,17 @@ TEST(WavIO, ReadValueInt16) { } TEST(WavIO, ReadValueUInt16) { - std::vector test_data = { + std::vector test_data = { 0x00, 0x00, // 0 0xff, 0x00, // 255 0x00, 0x01, // 256 0xff, 0xff, // 65535 0x00, 0x80, // 32768 }; - string test_string(test_data.begin(), test_data.end()); + std::string test_string(test_data.begin(), test_data.end()); int offset = 0; - uint16 read_value; + uint16_t read_value; TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset)); EXPECT_EQ(0, read_value); EXPECT_EQ(2, offset); @@ -411,14 +526,14 @@ TEST(WavIO, ReadValueUInt16) { } TEST(WavIO, ReadValueInt32) { - std::vector test_data = { + std::vector test_data = { 0x00, 0x00, 0x00, 0x00, // 0 0xff, 0x00, 0x00, 0x00, // 255 0x00, 0xff, 0x00, 0x00, // 65280 0x00, 0x00, 0xff, 0x00, // 16,711,680 0xff, 0xff, 0xff, 0xff, // -1 }; - string test_string(test_data.begin(), test_data.end()); + std::string test_string(test_data.begin(), test_data.end()); int offset = 0; int32_t read_value; @@ -447,17 +562,17 @@ TEST(WavIO, ReadValueInt32) { } TEST(WavIO, ReadValueUInt32) { - std::vector test_data = { + std::vector test_data = { 0x00, 0x00, 0x00, 0x00, // 0 0xff, 0x00, 0x00, 0x00, // 255 0x00, 0xff, 0x00, 0x00, // 65280 0x00, 0x00, 0xff, 0x00, // 16,711,680 0xff, 0xff, 0xff, 0xff, // 4,294,967,295 }; - string test_string(test_data.begin(), test_data.end()); + std::string test_string(test_data.begin(), test_data.end()); int offset = 0; - uint32 read_value; + uint32_t read_value; TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset)); EXPECT_EQ(0, read_value); EXPECT_EQ(4, offset); diff --git a/tensorflow/core/nccl/BUILD b/tensorflow/core/nccl/BUILD index 9fdae56fb81d87..ec1ee113fcff2d 100644 --- a/tensorflow/core/nccl/BUILD +++ b/tensorflow/core/nccl/BUILD @@ -61,7 +61,6 @@ tf_cuda_cc_test( "multi_gpu", "no_oss", "notap", - "cuda-only", # flaky on CI as of 2022-05-30 ], deps = [ "//tensorflow/core:test", diff --git a/tensorflow/core/profiler/backends/gpu/BUILD b/tensorflow/core/profiler/backends/gpu/BUILD index fee9ede90965fe..feb6a553313283 100644 --- a/tensorflow/core/profiler/backends/gpu/BUILD +++ b/tensorflow/core/profiler/backends/gpu/BUILD @@ -18,7 +18,6 @@ tf_cuda_cc_test( tags = tf_cuda_tests_tags() + [ "gpu_cupti", "nomac", - "cuda-only", # flaky on CI ], deps = [ "//tensorflow/cc:cc_ops", @@ -43,11 +42,11 @@ tf_cuda_cc_test( "//tensorflow/core/profiler/utils:xplane_utils", "//tensorflow/core/profiler/utils:xplane_visitor", "@com_google_absl//absl/strings", - "@local_xla//xla/backends/profiler/gpu:cuda_test", - "@local_xla//xla/backends/profiler/gpu:cupti_collector", "@local_xla//xla/backends/profiler/gpu:device_tracer", "@local_xla//xla/tsl/profiler/utils:tf_xplane_visitor", - ] + if_cuda_is_configured([ + ] + if_cuda_is_configured([ + "@local_xla//xla/backends/profiler/gpu:cupti_collector", + "@local_xla//xla/backends/profiler/gpu:cuda_test", "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cupti_headers", ]), diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 97ae6af69c56ae..964b014a3aa3f7 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -93,7 +93,7 @@ limitations under the License. 
#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 2434 // Updated: 2025/12/7 +#define TF_GRAPH_DEF_VERSION 2451 // Updated: 2025/12/24 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // diff --git a/tensorflow/core/runtime_fallback/kernel/attr_util.cc b/tensorflow/core/runtime_fallback/kernel/attr_util.cc index 82bb7ce1b89b57..3c319e09e0e137 100644 --- a/tensorflow/core/runtime_fallback/kernel/attr_util.cc +++ b/tensorflow/core/runtime_fallback/kernel/attr_util.cc @@ -72,7 +72,7 @@ absl::Status ParseValue(absl::string_view input, bool* value) { return absl::OkStatus(); } -absl::Status ParseValue(absl::string_view input, int32* value) { +absl::Status ParseValue(absl::string_view input, int32_t* value) { bool parse_result = absl::SimpleAtoi(input, value); if (!parse_result) { return errors::InvalidArgument("Could not parse int32 from ", input); @@ -90,7 +90,7 @@ absl::Status ParseValue(absl::string_view input, std::string* value) { return absl::OkStatus(); } -absl::Status ParseValue(absl::string_view input, std::vector* value) { +absl::Status ParseValue(absl::string_view input, std::vector* value) { std::vector parts = str_util::Split(input, ","); value->reserve(parts.size()); for (const auto& value_str : parts) { @@ -123,7 +123,7 @@ absl::Status AddOpAttr(const std::string& name, const std::string& attr_value, } else if (type == "i32") { int32_t val; s = ParseValue(value, &val); - opattrs->Set(name, val); + opattrs->Set(name, val); } else if (type == "string" || type == "padding") { std::string val; s = ParseValue(value, &val); @@ -133,9 +133,9 @@ absl::Status AddOpAttr(const std::string& name, const std::string& attr_value, s = ParseValue(value, &val); opattrs->Set(name, tfd::ConvertFromTfDataType(val)); } else if (type == "list(i32)") { - std::vector val; + std::vector val; s = ParseValue(value, &val); - opattrs->SetArray(name, val); + opattrs->SetArray(name, val); } return s; } diff --git a/tensorflow/core/runtime_fallback/kernel/attr_util.h b/tensorflow/core/runtime_fallback/kernel/attr_util.h index 4abbb4f8b31c58..41f0e657d6b1af 100644 --- a/tensorflow/core/runtime_fallback/kernel/attr_util.h +++ b/tensorflow/core/runtime_fallback/kernel/attr_util.h @@ -38,10 +38,10 @@ typedef llvm::StringMap AttrMap; // Parse value from the given string input. 
absl::Status ParseValue(absl::string_view input, bool* value); -absl::Status ParseValue(absl::string_view input, int32* value); +absl::Status ParseValue(absl::string_view input, int32_t* value); absl::Status ParseValue(absl::string_view input, DataType* value); absl::Status ParseValue(absl::string_view input, std::string* value); -absl::Status ParseValue(absl::string_view input, std::vector* value); +absl::Status ParseValue(absl::string_view input, std::vector* value); absl::Status ParseValue(absl::string_view input, Padding* value); absl::Status AddOpAttr(const std::string& name, const std::string& attr_value, diff --git a/tensorflow/core/runtime_fallback/kernel/attr_util_test.cc b/tensorflow/core/runtime_fallback/kernel/attr_util_test.cc index 79d80b13ff501a..e6975350c55da4 100644 --- a/tensorflow/core/runtime_fallback/kernel/attr_util_test.cc +++ b/tensorflow/core/runtime_fallback/kernel/attr_util_test.cc @@ -47,9 +47,9 @@ TEST(AttrUtilTest, TestGetIntAttr) { TF_ASSERT_OK(AddOpAttr("bar", "i32$0", &opattrs)); TF_ASSERT_OK(AddOpAttr("baz", "i32$123", &opattrs)); - ASSERT_EQ(opattrs.GetAsserting("foo"), -2); - ASSERT_EQ(opattrs.GetAsserting("bar"), 0); - ASSERT_EQ(opattrs.GetAsserting("baz"), 123); + ASSERT_EQ(opattrs.GetAsserting("foo"), -2); + ASSERT_EQ(opattrs.GetAsserting("bar"), 0); + ASSERT_EQ(opattrs.GetAsserting("baz"), 123); absl::Status s = AddOpAttr("invalid", "i32$4.5", &opattrs); ASSERT_FALSE(s.ok()); @@ -71,17 +71,17 @@ TEST(AttrUtilTest, TestGetIntListAttr) { TF_ASSERT_OK(AddOpAttr("baz", "list(i32)$1,2,3", &opattrs)); // std::vector v1, v2, v3; - ArrayRef v1, v2, v3; - std::vector expected_v1; - std::vector expected_v2 = {1}; - std::vector expected_v3 = {1, 2, 3}; - ArrayRef expected_v1_ref(expected_v1); - ArrayRef expected_v2_ref(expected_v2); - ArrayRef expected_v3_ref(expected_v3); - - ASSERT_TRUE(opattrs.GetArray("foo", &v1)); - ASSERT_TRUE(opattrs.GetArray("bar", &v2)); - ASSERT_TRUE(opattrs.GetArray("baz", &v3)); + ArrayRef v1, v2, v3; + std::vector expected_v1; + std::vector expected_v2 = {1}; + std::vector expected_v3 = {1, 2, 3}; + ArrayRef expected_v1_ref(expected_v1); + ArrayRef expected_v2_ref(expected_v2); + ArrayRef expected_v3_ref(expected_v3); + + ASSERT_TRUE(opattrs.GetArray("foo", &v1)); + ASSERT_TRUE(opattrs.GetArray("bar", &v2)); + ASSERT_TRUE(opattrs.GetArray("baz", &v3)); ASSERT_EQ(v1, expected_v1_ref); ASSERT_EQ(v2, expected_v2_ref); ASSERT_EQ(v3, expected_v3_ref); diff --git a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc index b496e1924107d9..2bab64c6a02ac6 100644 --- a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc +++ b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc @@ -427,8 +427,9 @@ TF_ATTRIBUTE_ALWAYS_INLINE static void KernelFallbackExecuteOpInternal( [&]() { return GetTracingMetadata(args, exec_ctx, kernel_runner); }); if (fallback_request_state.log_device_placement() || VLOG_IS_ON(1)) { - string msg = absl::StrCat("Executing op ", frame.op_name().GetValue().str(), - " in device ", frame.device().GetValue().str()); + std::string msg = + absl::StrCat("Executing op ", frame.op_name().GetValue().str(), + " in device ", frame.device().GetValue().str()); if (!logging::LogToListeners(msg)) { LOG(INFO) << msg; } @@ -865,10 +866,10 @@ llvm::Expected Predicate( CASE(float); CASE(double); - CASE(uint8); - CASE(int8); - CASE(int16); - CASE(int32); + CASE(uint8_t); + CASE(int8_t); + CASE(int16_t); 
+ CASE(int32_t); CASE(int64_t); CASE(bool); #undef CASE diff --git a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_kernels.cc b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_kernels.cc index da93625c5111c2..b93902e576ddd6 100644 --- a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_kernels.cc +++ b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_kernels.cc @@ -77,7 +77,7 @@ static void TFDConstantTensor(tfrt::Argument value, // it causes a missing typeinfo error when using -fno-rtti. Investigate // if we can make it work with no-rtti. Tensor out(DT_INT32, TensorShape({})); - out.flat()(0) = value.get(); + out.flat()(0) = value.get(); tensor.Emplace(out); } diff --git a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.cc b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.cc index 41e7cfae0637e7..c26dae601b69fe 100644 --- a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.cc +++ b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.cc @@ -81,9 +81,9 @@ absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name, template <> absl::Status TFRTOpKernelConstruction::GetAttr( - absl::string_view attr_name, std::vector* value) const { - llvm::ArrayRef arrayref; - bool success = attributes_.GetArray( + absl::string_view attr_name, std::vector* value) const { + llvm::ArrayRef arrayref; + bool success = attributes_.GetArray( llvm::StringRef(attr_name.data(), attr_name.size()), &arrayref); if (!success) { return MissingAttributeError(attr_name); @@ -239,7 +239,7 @@ TFRTOpMetaBuilder& TFRTOpMetaBuilder::Attr(absl::string_view attr_spec) { return *this; } -const string& TFRTOpMetaBuilder::op_name() const { return op_name_; } +const std::string& TFRTOpMetaBuilder::op_name() const { return op_name_; } TFRTOpMeta TFRTOpMetaBuilder::BuildMeta() const { return TFRTOpMeta(output_types_); diff --git a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h index e370fde54e23db..e06a0f13f3ec2b 100644 --- a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h +++ b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h @@ -100,8 +100,8 @@ absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name, Padding* value) const; template <> -absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name, - std::vector* value) const; +absl::Status TFRTOpKernelConstruction::GetAttr( + absl::string_view attr_name, std::vector* value) const; absl::Status MissingAttributeError(absl::string_view attr_name); @@ -207,11 +207,11 @@ class TFRTOpMetaBuilder { TFRTOpMetaBuilder& Input(absl::string_view input_spec); TFRTOpMetaBuilder& Attr(absl::string_view attr_spec); - const string& op_name() const; + const std::string& op_name() const; TFRTOpMeta BuildMeta() const; private: - string op_name_; + std::string op_name_; std::vector output_types_; }; diff --git a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel_test.cc b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel_test.cc index 5c99d39745c519..3b96ce59d9335d 100644 --- a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel_test.cc +++ b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel_test.cc @@ -56,7 +56,7 @@ TEST(TFRTOpKernelTest, TestGetBoolAttr) { TEST(TFRTOpKernelTest, TestGetIntAttr) { tfrt::OpAttrs attrs; - attrs.Set("foo", -2); + attrs.Set("foo", -2); tfrt::OpAttrsRef attrsref(attrs); TFRTOpKernelConstruction ctx(attrsref); @@ -68,18 +68,18 @@ TEST(TFRTOpKernelTest, TestGetIntAttr) { 
TEST(TFRTOpKernelTest, TestGetIntListAttr) { tfrt::OpAttrs attrs; - attrs.SetArray("foo", {}); - attrs.SetArray("bar", {1}); - attrs.SetArray("baz", {1, 2, 3}); + attrs.SetArray("foo", {}); + attrs.SetArray("bar", {1}); + attrs.SetArray("baz", {1, 2, 3}); attrs.SetString("bar", "test"); tfrt::OpAttrsRef attrsref(attrs); TFRTOpKernelConstruction ctx(attrsref); - std::vector v1, v2, v3; - std::vector expected_v1; - std::vector expected_v2 = {1}; - std::vector expected_v3 = {1, 2, 3}; + std::vector v1, v2, v3; + std::vector expected_v1; + std::vector expected_v2 = {1}; + std::vector expected_v3 = {1, 2, 3}; TF_ASSERT_OK(ctx.GetAttr("foo", &v1)); ASSERT_EQ(v1, expected_v1); TF_ASSERT_OK(ctx.GetAttr("bar", &v2)); @@ -217,7 +217,7 @@ TEST(TFRTOpKernelTest, TestAllocateTemp) { ASSERT_EQ(out.AllocatedBytes(), 0); TF_EXPECT_OK(ctx.allocate_temp(DT_INT32, {}, &out)); ASSERT_GT(out.AllocatedBytes(), 0); - out.scalar()() = 123; + out.scalar()() = 123; ASSERT_EQ(out.dtype(), DT_INT32); ASSERT_EQ(out.shape().dims(), 0); } diff --git a/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.cc b/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.cc index 04149fd2b397b4..a35e77ae99776f 100644 --- a/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.cc +++ b/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.cc @@ -50,8 +50,9 @@ constexpr char kBatchesToAverageOverAttr[] = "_batches_to_average_over"; } // namespace -int32 BatchFunctionFallbackKernelBase:: - NumBatchThreadsFromEnvironmentWithDefault(int default_num_batch_threads) { +int32_t +BatchFunctionFallbackKernelBase::NumBatchThreadsFromEnvironmentWithDefault( + int default_num_batch_threads) { int32_t num; const char* val = std::getenv("TF_NUM_BATCH_THREADS"); diff --git a/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h b/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h index f053704fd50dcb..3b26516602d4d2 100644 --- a/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h +++ b/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h @@ -67,7 +67,7 @@ class BatchFunctionFallbackKernelBase : public AsyncOpKernel { void SetAdaptiveBatchSchedulerOptions(OpKernelConstruction* c, int32_t num_batch_threads); - static int32 NumBatchThreadsFromEnvironmentWithDefault( + static int32_t NumBatchThreadsFromEnvironmentWithDefault( int default_num_batch_threads); static thread::ThreadPool* GetOrCreateBatchThreadsPool(); static constexpr int64_t kBatchThreadPoolSize = 128; @@ -80,10 +80,10 @@ class BatchFunctionFallbackKernelBase : public AsyncOpKernel { int32_t batch_timeout_micros_; int32_t max_enqueued_batches_; std::vector allowed_batch_sizes_; - int32 low_priority_max_batch_size_; - int32 low_priority_batch_timeout_micros_; - int32 low_priority_max_enqueued_batches_; - std::vector low_priority_allowed_batch_sizes_; + int32_t low_priority_max_batch_size_; + int32_t low_priority_batch_timeout_micros_; + int32_t low_priority_max_enqueued_batches_; + std::vector low_priority_allowed_batch_sizes_; std::string mixed_priority_policy_; bool enable_large_batch_splitting_; bool has_attribute_enable_large_batch_splitting_; @@ -100,10 +100,10 @@ class BatchFunctionFallbackKernelBase : public AsyncOpKernel { static constexpr int64_t kMaxInflightBatches = 64; bool enable_adaptive_batch_threads_ = false; struct AdaptiveBatchSchedulerOptions { - int32 min_in_flight_batches_limit = kMinInflightBatches; - int32 initial_in_flight_batches_limit = kInitialInflightBatches; - int32 
max_in_flight_batches_limit = kMaxInflightBatches; - int32 batches_to_average_over = kBatchesToAverageOver; + int32_t min_in_flight_batches_limit = kMinInflightBatches; + int32_t initial_in_flight_batches_limit = kInitialInflightBatches; + int32_t max_in_flight_batches_limit = kMaxInflightBatches; + int32_t batches_to_average_over = kBatchesToAverageOver; }; std::optional adaptive_batch_scheduler_options_ = std::nullopt; diff --git a/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc b/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc index 100290da8bff1e..016ccf6b1bf55c 100644 --- a/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc +++ b/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc @@ -193,7 +193,7 @@ class FallbackBatchResource : public tensorflow::serving::BatchResourceBase { return absl::OkStatus(); } - string DebugString() const final { return "FallbackBatchResource"; } + std::string DebugString() const final { return "FallbackBatchResource"; } const tsl::RCReference& batch_function() const { return bef_func_; @@ -407,6 +407,7 @@ REGISTER_KERNEL_BUILDER( // Identical to BatchFunction except it has 2 extra TFRT attributes and it does // not have `f` attribute. Users will not invoke this op directly. +// LINT.IfChange REGISTER_OP("_BatchFunctionFallback") .Input("in_tensors: Tin") .Input("captured_tensors: Tcaptured") @@ -467,6 +468,7 @@ REGISTER_OP("_BatchFunctionFallback") .Attr("opaque_function_handle: int") .SetShapeFn(shape_inference::UnknownShape); +// LINT.ThenChange(//tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc) } // namespace } // namespace tfrt_stub } // namespace tensorflow diff --git a/tensorflow/core/runtime_fallback/test/forwarding_test_kernels.cc b/tensorflow/core/runtime_fallback/test/forwarding_test_kernels.cc index 4454a04cc1ab34..758a9074637aa2 100644 --- a/tensorflow/core/runtime_fallback/test/forwarding_test_kernels.cc +++ b/tensorflow/core/runtime_fallback/test/forwarding_test_kernels.cc @@ -39,8 +39,8 @@ class ScalarAdd : public OpKernelT { const Tensor& input1 = ctx->input(1); Tensor output(input0); - output.scalar()() = - input0.scalar()() + input1.scalar()(); + output.scalar()() = + input0.scalar()() + input1.scalar()(); ctx->set_output(0, output); } @@ -54,7 +54,7 @@ REGISTER_OP("ScalarAdd") SCALAR_ADD_PROPERTIES; // When calling ScalarAdd from TF, use the standard OpKernel* types. 
REGISTER_KERNEL_BUILDER( - Name("ScalarAdd").Device(DEVICE_CPU).TypeConstraint("T"), + Name("ScalarAdd").Device(DEVICE_CPU).TypeConstraint("T"), ScalarAdd) #endif diff --git a/tensorflow/core/runtime_fallback/test/tfrt_forwarding_kernels.cc b/tensorflow/core/runtime_fallback/test/tfrt_forwarding_kernels.cc index 6d45437dae4625..5193167366ac35 100644 --- a/tensorflow/core/runtime_fallback/test/tfrt_forwarding_kernels.cc +++ b/tensorflow/core/runtime_fallback/test/tfrt_forwarding_kernels.cc @@ -28,7 +28,7 @@ namespace tensorflow { static void TFDConstantTensor5D(tfrt::Argument value, tfrt::Result tensor) { Tensor out(DT_INT32, TensorShape({1, 1, 1, 1, 1})); - out.flat()(0) = value.get(); + out.flat()(0) = value.get(); tensor.Emplace(out); } diff --git a/tensorflow/core/summary/BUILD b/tensorflow/core/summary/BUILD index 7b0981742dd5b4..8af924c1b40dfe 100644 --- a/tensorflow/core/summary/BUILD +++ b/tensorflow/core/summary/BUILD @@ -24,6 +24,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core/lib/db:sqlite", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], ) @@ -55,6 +56,7 @@ cc_library( "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", "@local_xla//xla/tsl/protobuf:error_codes_proto_impl_cc", "@local_xla//xla/tsl/protobuf:histogram_proto_cc", ], @@ -74,6 +76,7 @@ tf_cc_test( "//tensorflow/core/lib/db:sqlite", "@com_google_absl//absl/log", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@local_xla//xla/tsl/protobuf:histogram_proto_cc", ], ) @@ -128,6 +131,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/lib/png:png_io", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/summary/loader.cc b/tensorflow/core/summary/loader.cc index 1443cffc4c6e6a..08e4ea469b106b 100644 --- a/tensorflow/core/summary/loader.cc +++ b/tensorflow/core/summary/loader.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include +#include #include #include #include diff --git a/tensorflow/core/summary/schema.cc b/tensorflow/core/summary/schema.cc index 2cd421afc59bff..3ba5db4037e419 100644 --- a/tensorflow/core/summary/schema.cc +++ b/tensorflow/core/summary/schema.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/summary/schema.h" #include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { diff --git a/tensorflow/core/summary/summary_converter.cc b/tensorflow/core/summary/summary_converter.cc index a5e3695e420103..449f851c74669f 100644 --- a/tensorflow/core/summary/summary_converter.cc +++ b/tensorflow/core/summary/summary_converter.cc @@ -21,8 +21,10 @@ limitations under the License. 
#include #include #include +#include #include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/summary.pb.h" #include "tensorflow/core/framework/types.h" diff --git a/tensorflow/core/summary/summary_db_writer.cc b/tensorflow/core/summary/summary_db_writer.cc index 849fc9a6954c7e..2cc0a6b36a1863 100644 --- a/tensorflow/core/summary/summary_db_writer.cc +++ b/tensorflow/core/summary/summary_db_writer.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -28,6 +29,8 @@ limitations under the License. #include "absl/log/log.h" #include "absl/status/status.h" #include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "xla/tsl/protobuf/error_codes.pb.h" #include "xla/tsl/protobuf/histogram.pb.h" #include "tensorflow/core/framework/graph.pb.h" diff --git a/tensorflow/core/summary/summary_db_writer_test.cc b/tensorflow/core/summary/summary_db_writer_test.cc index 8c25da1823f057..b65349e935aa15 100644 --- a/tensorflow/core/summary/summary_db_writer_test.cc +++ b/tensorflow/core/summary/summary_db_writer_test.cc @@ -17,10 +17,12 @@ limitations under the License. #include #include #include +#include #include #include "absl/log/log.h" #include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "xla/tsl/protobuf/histogram.pb.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph.pb.h" diff --git a/tensorflow/core/summary/summary_file_writer.cc b/tensorflow/core/summary/summary_file_writer.cc index dfb1bba4aecbe5..a77641f7e912e5 100644 --- a/tensorflow/core/summary/summary_file_writer.cc +++ b/tensorflow/core/summary/summary_file_writer.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include #include +#include #include #include diff --git a/tensorflow/core/summary/summary_file_writer_test.cc b/tensorflow/core/summary/summary_file_writer_test.cc index 94ca029774f40d..c0ef770435f05c 100644 --- a/tensorflow/core/summary/summary_file_writer_test.cc +++ b/tensorflow/core/summary/summary_file_writer_test.cc @@ -15,9 +15,11 @@ limitations under the License. #include "tensorflow/core/summary/summary_file_writer.h" #include +#include #include #include #include +#include #include #include diff --git a/tensorflow/core/summary/vacuum.cc b/tensorflow/core/summary/vacuum.cc index 29c459cca89f13..7db3633b4c21c0 100644 --- a/tensorflow/core/summary/vacuum.cc +++ b/tensorflow/core/summary/vacuum.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #include +#include #include "absl/log/log.h" #include "tensorflow/core/lib/db/sqlite.h" diff --git a/tensorflow/core/tfrt/common/BUILD b/tensorflow/core/tfrt/common/BUILD index 5658c7db2ca6bb..571caba934dfe7 100644 --- a/tensorflow/core/tfrt/common/BUILD +++ b/tensorflow/core/tfrt/common/BUILD @@ -106,6 +106,7 @@ cc_library( "@com_google_absl//absl/synchronization", "@local_tsl//tsl/platform:statusor", "@local_xla//xla/client:local_client", + "@local_xla//xla/pjrt:host_memory_allocator", "@local_xla//xla/pjrt:local_device_state", "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt:tf_pjrt_client", diff --git a/tensorflow/core/tfrt/common/async_value_tensor.h b/tensorflow/core/tfrt/common/async_value_tensor.h index 06e99f8f7bcc48..83d0efcb5cc63a 100644 --- a/tensorflow/core/tfrt/common/async_value_tensor.h +++ b/tensorflow/core/tfrt/common/async_value_tensor.h @@ -64,7 +64,7 @@ class AsyncValueAllocator : public Allocator { void DeallocateRaw(void* ptr) override; bool AllocatesOpaqueHandle() const override { return true; } - string Name() override { return "async-value"; } + std::string Name() override { return "async-value"; } }; } // namespace tensorflow diff --git a/tensorflow/core/tfrt/common/pjrt_state.cc b/tensorflow/core/tfrt/common/pjrt_state.cc index 9a6ec5bba211e5..e20e2ca0790586 100644 --- a/tensorflow/core/tfrt/common/pjrt_state.cc +++ b/tensorflow/core/tfrt/common/pjrt_state.cc @@ -99,6 +99,6 @@ PjRtGpuClientCreationInfo* PjRtState::GetPjRtGpuClientCreationInfo() { return pjrt_gpu_client_creation_info_.get(); } -string PjRtState::DebugString() const { return "PjRtState"; } +std::string PjRtState::DebugString() const { return "PjRtState"; } } // namespace tensorflow diff --git a/tensorflow/core/tfrt/common/pjrt_state.h b/tensorflow/core/tfrt/common/pjrt_state.h index 0c6f78cfd82ba8..e0e9f8657bb8a8 100644 --- a/tensorflow/core/tfrt/common/pjrt_state.h +++ b/tensorflow/core/tfrt/common/pjrt_state.h @@ -25,6 +25,7 @@ limitations under the License. #include "absl/status/statusor.h" #include "absl/synchronization/mutex.h" #include "xla/client/local_client.h" +#include "xla/pjrt/host_memory_allocator.h" #include "xla/pjrt/local_device_state.h" #include "xla/pjrt/pjrt_client.h" #include "xla/stream_executor/integrations/tf_allocator_adapter.h" @@ -44,7 +45,7 @@ using PjRtClientsMap = std::map>; struct PjRtGpuClientCreationInfo { std::set allowed_devices; std::unique_ptr allocator; - std::unique_ptr host_memory_allocator; + std::unique_ptr host_memory_allocator; std::map> local_device_states; xla::LocalClient* local_client; }; @@ -62,7 +63,7 @@ class PjRtState : public ResourceBase { // Moves PJRT client to `unused_`. The PJRT client moved to `unused_` will not // be returned by `GetPjRtClient`. absl::Status MovePjRtClientToUnused(const DeviceType& device_type); - string DebugString() const override; + std::string DebugString() const override; // Saves information needed to create a PJRT client (to enable creating a // client with remote devices). 
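Editorial note (not part of the patch): several hunks above change virtual overrides such as Allocator::Name() and ResourceBase::DebugString() from the unqualified string typedef to std::string. For reference, a minimal tensorflow::Allocator subclass using the updated signature might look like the sketch below; the class and its "scratch" name are hypothetical illustrations, not code from this patch, and error handling is omitted.

  #include <string>

  #include "tensorflow/core/framework/allocator.h"
  #include "tensorflow/core/platform/mem.h"

  // Minimal allocator: the three virtual methods mirror the overrides touched
  // in the diff above (Name now returns std::string).
  class ScratchAllocator : public tensorflow::Allocator {
   public:
    std::string Name() override { return "scratch"; }
    void* AllocateRaw(size_t alignment, size_t num_bytes) override {
      // AlignedMalloc takes (size, minimum_alignment).
      return tensorflow::port::AlignedMalloc(num_bytes, alignment);
    }
    void DeallocateRaw(void* ptr) override {
      tensorflow::port::AlignedFree(ptr);
    }
  };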
diff --git a/tensorflow/core/tfrt/fallback/fallback_state.cc b/tensorflow/core/tfrt/fallback/fallback_state.cc index c7f12aed50daa3..c500f862e1e706 100644 --- a/tensorflow/core/tfrt/fallback/fallback_state.cc +++ b/tensorflow/core/tfrt/fallback/fallback_state.cc @@ -51,8 +51,9 @@ namespace tfrt_stub { namespace { -string DeviceName(absl::string_view name_prefix, absl::string_view device_type, - int32_t task_id, size_t device_id) { +std::string DeviceName(absl::string_view name_prefix, + absl::string_view device_type, int32_t task_id, + size_t device_id) { return strings::StrCat(absl::StripSuffix(name_prefix, "0"), task_id, "/device:", device_type, ":", device_id); } diff --git a/tensorflow/core/tfrt/ifrt/BUILD b/tensorflow/core/tfrt/ifrt/BUILD index 4533b3c2e102de..7d0d005b0474dc 100644 --- a/tensorflow/core/tfrt/ifrt/BUILD +++ b/tensorflow/core/tfrt/ifrt/BUILD @@ -120,7 +120,6 @@ cc_library( ":ifrt_persistent_compilation_cache", ":ifrt_restore_tensor_registry", ":ifrt_serving_core_selector", - ":ifrt_tensor_utils", ":sharding_utils", ":tf_host_callback", "//tensorflow/compiler/mlir/tensorflow", diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc index d8c2064e5f6f81..bc376e94d09962 100644 --- a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc +++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc @@ -97,7 +97,6 @@ limitations under the License. #include "tensorflow/core/tfrt/ifrt/ifrt_persistent_compilation_cache.h" #include "tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h" #include "tensorflow/core/tfrt/ifrt/ifrt_serving_core_selector.h" -#include "tensorflow/core/tfrt/ifrt/ifrt_tensor_utils.h" #include "tensorflow/core/tfrt/ifrt/sharding_utils.h" #include "tensorflow/core/tfrt/ifrt/tf_host_callback.h" #include "tsl/platform/tstring.h" @@ -504,7 +503,7 @@ IfrtServingExecutable::CreateExecutableSynchronously( compile_metadata.use_shardy_partitioner()); xla_compile_options.parameter_is_tupled_arguments = false; // Use portable execution for single device + core selection. - if (UsePortableExecution(compile_metadata)) { + if (UsePortableExecution()) { xla_compile_options.compile_portable_executable = true; } else { TF_ASSIGN_OR_RETURN( @@ -555,9 +554,8 @@ IfrtServingExecutable::CreateExecutableSynchronously( return executable_bundle; } -tsl::Future +absl::StatusOr> IfrtServingExecutable::LookUpOrCreateExecutable( - const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata, absl::Span dtypes_and_shapes, absl::Span variable_arg_indices) { std::vector input_shapes; @@ -597,7 +595,18 @@ IfrtServingExecutable::LookUpOrCreateExecutable( // compilation. module_copy = mlir::OwningOpRef(module_->clone()); } + tensorflow::tpu::TPUCompileMetadataProto compile_metadata = + original_compile_metadata_; + + // b/469105465: Add test coverage for core selection in execution. + if (UsePortableExecution()) { + // Clear device_assignment because portable execution doesn't allow device + // assignment. + compile_metadata.clear_device_assignment(); + } + TF_RETURN_IF_ERROR( + UpdateCompileMetadata(compile_metadata, dtypes_and_shapes)); LOG(INFO) << "Cache missed. 
Building executable"; absl::StatusOr executable_bundle = CreateExecutableSynchronously(std::move(module_copy), compile_metadata, @@ -613,11 +622,11 @@ void IfrtServingExecutable::Freeze() { module_ = nullptr; } -bool IfrtServingExecutable::UsePortableExecution( - const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata) { +bool IfrtServingExecutable::UsePortableExecution() { // TODO(b/335247101) Add a check that the core selector must be non-null if // it is a single-device program after core selection in Ifrt is stable. - return IsSingleDevice(compile_metadata) && ifrt_serving_core_selector_; + return IsSingleDevice(original_compile_metadata_) && + ifrt_serving_core_selector_; } absl::StatusOr> IfrtServingExecutable::Execute( @@ -657,20 +666,12 @@ absl::StatusOr> IfrtServingExecutable::Execute( BuildDtypeAndShape(inputs, variable_arg_indices, ifrt_restore_tensor_registry_)); - tensorflow::tpu::TPUCompileMetadataProto compile_metadata = - original_compile_metadata_; - TF_RETURN_IF_ERROR( - UpdateCompileMetadata(compile_metadata, dtypes_and_shapes)); - // `device_reservation` should be alive before the end of the execution. tsl::DeviceReservation device_reservation(kNoCoreSelectedIndex, nullptr); xla::ifrt::DeviceListRef device_list; - if (UsePortableExecution(compile_metadata)) { + if (UsePortableExecution()) { device_reservation = ifrt_serving_core_selector_->ReserveDevice(program_id_); - // Clear device_assignment because portable execution doesn't allow device - // assignment. - compile_metadata.clear_device_assignment(); TF_ASSIGN_OR_RETURN(xla::ifrt::Device * device, ifrt_client_->LookupDevice(xla::ifrt::DeviceId( device_reservation.device_index()))); @@ -679,10 +680,10 @@ absl::StatusOr> IfrtServingExecutable::Execute( device_list = assigned_device_list_; } TF_ASSIGN_OR_RETURN( - SharedCachedExecutableBundle executable_bundle, - LookUpOrCreateExecutable(compile_metadata, dtypes_and_shapes, - variable_arg_indices) - .Await()); + tsl::Future executable_bundle_future, + LookUpOrCreateExecutable(dtypes_and_shapes, variable_arg_indices)); + TF_ASSIGN_OR_RETURN(SharedCachedExecutableBundle executable_bundle, + executable_bundle_future.Await()); if (executable_bundle->compile_metadata.args().size() != dtypes_and_shapes.size()) { @@ -694,7 +695,7 @@ absl::StatusOr> IfrtServingExecutable::Execute( { tsl::profiler::TraceMe traceme("AsyncRestoreVariables"); absl::ReaderMutexLock lock(mutex_); - if (!is_frozen_) { + if (!is_frozen_ && !tf_to_hlo_compiler_->IsXlaCompilationDisabled()) { // Asynchronously load the restored variable tensors to Ifrt array. 
TF_RETURN_IF_ERROR(AsyncLoadIfrtArray(inputs, variable_arg_indices, *executable_bundle, device_list)); @@ -775,7 +776,7 @@ absl::StatusOr> IfrtServingExecutable::Execute( VLOG(2) << "Start Execution"; std::optional execution_device_list; - if (UsePortableExecution(compile_metadata)) { + if (UsePortableExecution()) { execution_device_list = device_list; } diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h index 8e29544fd01d78..ac772ae89d89be 100644 --- a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h +++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h @@ -230,10 +230,9 @@ class IfrtServingExecutable { const CachedExecutableBundle& executable_bundle, const xla::ifrt::DeviceListRef& devices); - tsl::Future LookUpOrCreateExecutable( - const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata, - absl::Span dtypes_and_shapes, - absl::Span variable_arg_indices); + absl::StatusOr> + LookUpOrCreateExecutable(absl::Span dtypes_and_shapes, + absl::Span variable_arg_indices); absl::StatusOr CreateExecutableSynchronously( mlir::OwningOpRef module_copy, @@ -248,8 +247,7 @@ class IfrtServingExecutable { std::vector GetArgShape( int arg_index, const CachedExecutableBundle& entry); - bool UsePortableExecution( - const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata); + bool UsePortableExecution(); }; } // namespace ifrt_serving diff --git a/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc b/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc index 17243e2e3b0bc6..b260fc6a492833 100644 --- a/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc +++ b/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc @@ -244,7 +244,7 @@ class MlrtBatchResource : public tensorflow::serving::BatchResourceBase { return absl::OkStatus(); } - string DebugString() const final { return "MlrtBatchResource"; } + std::string DebugString() const final { return "MlrtBatchResource"; } mlrt::bc::Function batch_function() const { return batch_function_; } @@ -461,6 +461,7 @@ REGISTER_KERNEL_BUILDER( Name(kMlrtBatchFunctionName).Device(DEVICE_GPU), tfrt_stub::BatchFunctionFallbackKernel); +// LINT.IfChange // Identical to BatchFunction except it has 2 extra TFRT attributes and it does // not have `f` attribute. Users will not invoke this op directly. REGISTER_OP(kMlrtBatchFunctionName) @@ -475,6 +476,43 @@ REGISTER_OP(kMlrtBatchFunctionName) .Attr("container: string = ''") .Attr("shared_name: string = ''") .Attr("batching_queue: string = ''") + // A separate set of batch options for the low priority requests, which is + // used for priority queue batching. + .Attr("low_priority_max_batch_size: int = 0") + .Attr("low_priority_batch_timeout_micros: int = 0") + .Attr("low_priority_allowed_batch_sizes: list(int) = []") + .Attr("low_priority_max_enqueued_batches: int = 0") + // Policy that determines the mixed priority batching behavior when low + // priority batch parameters are present. + // + // low_priority_padding_with_next_allowed_batch_size: If high priority + // batches time out without reaching the max batch size, low priority inputs + // pad the high priority batches up to the next allowed batch size. A low + // priority only batch gets scheduled only when the low priority input times + // out or reaches the max batch size while there is no high priority input + // waiting to be processed. + // low_priority_padding_with_max_batch_size: Same as above but pad up to the + // max batch size.
+ // priority_isolation: High priority and low priority inputs never share the + // same batch, i.e., no low priority input padding high priority batches. + // Low priority inputs get scheduled only as part of low priority only + // batches as described above. + // priority_merge: High and low priority inputs are queued separately but + // when a batch needs to be scheduled, the two queues are treated as one + // merged flat list of inputs with high priority inputs at the front of the + // list of tasks to use for the next batch. If all inputs are of the same + // priority, the behavior is the same as disabling prioritization. + .Attr( + "mixed_priority_policy: " + "{'low_priority_padding_with_max_batch_size', " + "'low_priority_padding_with_next_allowed_batch_size', " + "'priority_isolation', 'priority_merge'} = " + "'low_priority_padding_with_max_batch_size'") + // See the description of the batch_padding_policy attribute of + // BatchFunction in core/ops/batch_ops.cc. + .Attr( + "batch_padding_policy: " + "{'PAD_UP', 'BATCH_DOWN', 'MINIMIZE_TPU_COST_PER_REQUEST'} = 'PAD_UP'") .Attr("Tin: list(type)") .Attr("Tcaptured: list(type) >= 0") .Attr("Tout: list(type)") @@ -485,6 +523,8 @@ REGISTER_OP(kMlrtBatchFunctionName) .Attr("opaque_function_handle: int") .SetShapeFn(shape_inference::UnknownShape); +// LINT.ThenChange(//tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc) + } // namespace // TODO(rohitju, chky): This additional Register is not ideal but unavoidable diff --git a/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc b/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc index 0fc8f06b2b5e53..9956e74011d7ed 100644 --- a/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc +++ b/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc @@ -173,7 +173,7 @@ class TfrtSession : public tensorflow::Session { } absl::Status Create(GraphDef&& graph) override { - absl::MutexLock lock(&session_state_lock_); + absl::MutexLock lock(session_state_lock_); return CreateLocked(std::move(graph)); } @@ -279,7 +279,7 @@ class TfrtSession : public tensorflow::Session { } absl::Status Extend(GraphDef&& graph) override { - absl::MutexLock lock(&session_state_lock_); + absl::MutexLock lock(session_state_lock_); return ExtendLocked(std::move(graph)); } @@ -299,7 +299,7 @@ class TfrtSession : public tensorflow::Session { std::vector* outputs, const thread::ThreadPoolOptions& thread_pool_options) { { - absl::MutexLock lock(&session_state_lock_); + absl::MutexLock lock(session_state_lock_); if (session_state_ == SessionState::kInitialized) { return errors::Unavailable("Session not created yet."); } @@ -401,7 +401,7 @@ class TfrtSession : public tensorflow::Session { // NOTE: This API is still experimental and may change. absl::Status MakeCallable(const CallableOptions& callable_options, CallableHandle* out_handle) override { - absl::MutexLock lock(&callables_lock_); + absl::MutexLock lock(callables_lock_); *out_handle = next_callable_handle_++; assert(callables_.find(*out_handle) == callables_.end()); callables_[*out_handle] = {callable_options}; @@ -436,7 +436,7 @@ class TfrtSession : public tensorflow::Session { const thread::ThreadPoolOptions& thread_pool_options) override { Callable callable; { - absl::MutexLock lock(&callables_lock_); + absl::MutexLock lock(callables_lock_); auto it = callables_.find(handle); if (it == callables_.end()) return errors::InvalidArgument("No such callable handle: ", handle); @@ -466,7 +466,7 @@ class TfrtSession : public tensorflow::Session { /// session. 
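// [Editor's sketch; not the BatchResourceBase scheduler.] A toy model of the
// four mixed_priority_policy values documented above: given a high-priority
// batch that has timed out with high_count tasks, how many waiting
// low-priority tasks may be used to pad it. All names and thresholds here are
// illustrative; the real logic lives in tensorflow::serving.
#include <algorithm>

enum class MixedPriorityPolicy {
  kLowPriorityPaddingWithNextAllowedBatchSize,
  kLowPriorityPaddingWithMaxBatchSize,
  kPriorityIsolation,
  kPriorityMerge,
};

int LowPriorityPadding(MixedPriorityPolicy policy, int high_count,
                       int next_allowed_batch_size, int max_batch_size,
                       int low_waiting) {
  switch (policy) {
    case MixedPriorityPolicy::kLowPriorityPaddingWithNextAllowedBatchSize:
      // Pad only up to the next allowed batch size.
      return std::min(low_waiting,
                      std::max(0, next_allowed_batch_size - high_count));
    case MixedPriorityPolicy::kLowPriorityPaddingWithMaxBatchSize:
    case MixedPriorityPolicy::kPriorityMerge:
      // Pad (or merge the two queues) up to the max batch size; with a single
      // priority level this degenerates to ordinary batching.
      return std::min(low_waiting, std::max(0, max_batch_size - high_count));
    case MixedPriorityPolicy::kPriorityIsolation:
      // High- and low-priority tasks never share a batch.
      return 0;
  }
  return 0;
}

// e.g. with high_count = 3, next_allowed = 4, max = 8, low_waiting = 10:
// next_allowed policy -> 1, max / merge policies -> 5, isolation -> 0.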
/// NOTE: This API is still experimental and may change. absl::Status ReleaseCallable(CallableHandle handle) override { - absl::MutexLock lock(&callables_lock_); + absl::MutexLock lock(callables_lock_); auto it = callables_.find(handle); if (it == callables_.end()) return errors::InvalidArgument("No such callable handle: ", handle); @@ -475,7 +475,7 @@ class TfrtSession : public tensorflow::Session { } absl::Status Close() override { - absl::MutexLock lock(&session_state_lock_); + absl::MutexLock lock(session_state_lock_); session_state_ = SessionState::kClosed; return absl::OkStatus(); } @@ -721,7 +721,7 @@ class TfrtSessionFactory::ThreadPoolManager { "TFRT session does not yet support session local thread pool"); } - absl::MutexLock lock(&mutex_); + absl::MutexLock lock(mutex_); auto it = named_thread_pools_.find(name); // The thread pool with the given name already exists. @@ -842,7 +842,7 @@ absl::Status TfrtSessionFactory::NewSession(const SessionOptions& options, *out_session = nullptr; - absl::MutexLock lock(&mutex_); + absl::MutexLock lock(mutex_); std::vector> devices; TF_RETURN_IF_ERROR(DeviceFactory::AddDevices( options, "/job:localhost/replica:0/task:0", &devices)); @@ -873,13 +873,13 @@ static TfrtSessionFactory* session_factory = nullptr; tfrt_stub::Runtime* TfrtSessionFactory::GetRuntime() { DCHECK(session_factory != nullptr); - absl::MutexLock lock(&session_factory->mutex_); + absl::MutexLock lock(session_factory->mutex_); return session_factory->runtime_; } absl::Status InitializeTfrtSession(const TfrtSessionOptions& options) { DCHECK(session_factory != nullptr); - absl::MutexLock lock(&session_factory->mutex_); + absl::MutexLock lock(session_factory->mutex_); DCHECK(!session_factory->IsInitialized()); return UpdateTfrtSessionOptionsLocked(options); } diff --git a/tensorflow/core/tfrt/utils/graph_partition.cc b/tensorflow/core/tfrt/utils/graph_partition.cc index 08f5dce6d5734d..ddf50ab8c7ef4d 100644 --- a/tensorflow/core/tfrt/utils/graph_partition.cc +++ b/tensorflow/core/tfrt/utils/graph_partition.cc @@ -436,7 +436,7 @@ absl::StatusOr> InsertTransferOps( auto new_graph = std::make_unique(graph->flib_def()); FunctionDefLibrary flib = graph->flib_def().ToProto(); - std::unordered_map> partitions; + std::unordered_map> partitions; TF_RETURN_IF_ERROR( PartitionFunctionGraph(device_set, std::move(graph), &partitions)); @@ -447,7 +447,7 @@ absl::StatusOr> InsertTransferOps( std::map device_to_output_info_map; for (auto& partition : partitions) { - const string& device = partition.first; + const std::string& device = partition.first; VLOG(1) << "Process the partitioin on device: " << device; Graph* subgraph = partition.second.get(); diff --git a/tensorflow/core/tpu/kernels/image_resize_ops.cc b/tensorflow/core/tpu/kernels/image_resize_ops.cc index 7e255bab054550..dfc4077e8f10a1 100644 --- a/tensorflow/core/tpu/kernels/image_resize_ops.cc +++ b/tensorflow/core/tpu/kernels/image_resize_ops.cc @@ -57,7 +57,7 @@ class TpuCustomResizeOp : public XlaOpKernel { return output_shape; } - string OpaqueField() const { + std::string OpaqueField() const { return absl::StrCat("\"", align_corners_, half_pixel_centers_, "\""); } diff --git a/tensorflow/core/tpu/kernels/infeed_ops.cc b/tensorflow/core/tpu/kernels/infeed_ops.cc index d59c6c4b6d4683..2d13813db101cf 100644 --- a/tensorflow/core/tpu/kernels/infeed_ops.cc +++ b/tensorflow/core/tpu/kernels/infeed_ops.cc @@ -188,7 +188,9 @@ struct LinearizedBuffersWrapper { ~LinearizedBuffersWrapper() = default; // These functions are 
tensorflow::Variant requirements. - string TypeName() const { return "(anonymous)::LinearizedBuffersWrapper"; } + std::string TypeName() const { + return "(anonymous)::LinearizedBuffersWrapper"; + } void Encode(tensorflow::VariantTensorData* data) const { LOG(ERROR) << "Encode() is not implemented for LinearizedBuffersWrapper " "objects."; diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc index 182f5bf29ca32b..2fa5972f29af46 100644 --- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc +++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc @@ -44,10 +44,10 @@ limitations under the License. namespace tensorflow { -std::vector ConvertBinarySplitsToBucketSplits(int64 split, +std::vector ConvertBinarySplitsToBucketSplits(int64_t split, int max_division_level) { std::vector bucket_splits; - uint32 current_index = 0; + uint32_t current_index = 0; while (split > 0) { if (split % 2 == 1) { int split_level = absl::bit_width(current_index + 1) - 1; @@ -62,9 +62,9 @@ std::vector ConvertBinarySplitsToBucketSplits(int64 split, return bucket_splits; } -int64 ConvertBucketSplitsToBinarySplits(std::vector bucket_splits, - int max_division_level) { - int64 binary_splits = 0; +int64_t ConvertBucketSplitsToBinarySplits(std::vector bucket_splits, + int max_division_level) { + int64_t binary_splits = 0; for (auto& bucket_split : bucket_splits) { int split_level = max_division_level - 1; while (bucket_split > 0 && bucket_split % 2 == 0) { diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h index 72419504760aa6..cd958fc5d2218d 100644 --- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h +++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h @@ -33,11 +33,11 @@ namespace tensorflow { // Pad value used for SparseCore mini batching logic. 
const int32_t kXlaPadValue = std::numeric_limits::max(); -std::vector ConvertBinarySplitsToBucketSplits(int64 split, +std::vector ConvertBinarySplitsToBucketSplits(int64_t split, int max_division_level); -int64 ConvertBucketSplitsToBinarySplits(std::vector bucket_splits, - int max_division_level); +int64_t ConvertBucketSplitsToBinarySplits(std::vector bucket_splits, + int max_division_level); absl::Status ValidateInputCombiner(const std::string& combiner); diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils_test.cc b/tensorflow/core/tpu/kernels/sparse_core_ops_utils_test.cc index 9af20e1f2a540d..6a241cdb3a3795 100644 --- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils_test.cc +++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils_test.cc @@ -25,11 +25,11 @@ namespace { TEST(ConvertSplitsAndBackTest, Split0) { const int max_division_level = 6; - int64 original_split = 0; + int64_t original_split = 0; std::vector actual_buckets = ConvertBinarySplitsToBucketSplits(original_split, max_division_level); std::vector expected_buckets = {}; - int64 re_split = + int64_t re_split = ConvertBucketSplitsToBinarySplits(expected_buckets, max_division_level); ASSERT_EQ(re_split, original_split); } @@ -37,11 +37,11 @@ TEST(ConvertSplitsAndBackTest, Split0) { TEST(ConvertSplitsAndBackTest, Split2) { const int max_division_level = 6; - int64 original_split = 2; + int64_t original_split = 2; std::vector actual_buckets = ConvertBinarySplitsToBucketSplits(original_split, max_division_level); std::vector expected_buckets = {16}; - int64 re_split = + int64_t re_split = ConvertBucketSplitsToBinarySplits(expected_buckets, max_division_level); ASSERT_EQ(re_split, original_split); } @@ -49,11 +49,11 @@ TEST(ConvertSplitsAndBackTest, Split2) { TEST(ConvertSplitsAndBackTest, Split3) { const int max_division_level = 6; - int64 original_split = 3; + int64_t original_split = 3; std::vector actual_buckets = ConvertBinarySplitsToBucketSplits(original_split, max_division_level); std::vector expected_buckets = {16, 32}; - int64 re_split = + int64_t re_split = ConvertBucketSplitsToBinarySplits(expected_buckets, max_division_level); ASSERT_EQ(re_split, original_split); } diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc index 0815f742b4e9e5..ddd47e0d53c701 100644 --- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc +++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc @@ -118,9 +118,9 @@ absl::Status ValidateInputs(const Tensor& indices_or_row_splits, } absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits, - const int32 total_id_count, - const int32 sample_count, - int32* row_ids_before_padding, + const int32_t total_id_count, + const int32_t sample_count, + int32_t* row_ids_before_padding, std::vector shape_strides) { // The only difference between dense tensor, sparse tensor and ragged tensor // is the row ids output. @@ -129,7 +129,7 @@ absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits, // Row ids are just the index ids. // Note: this path is also taken when the input is a ragged/sparse tensor // with 0 elements. In that case, the row_ids will just be empty as well. 
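// [Editor's sketch; a stand-alone re-derivation, not the library code.] The
// Split2/Split3 tests above (max_division_level = 6, buckets {16} and
// {16, 32}) pin down the encoding behind ConvertBinarySplitsToBucketSplits /
// ConvertBucketSplitsToBinarySplits: bit i of the split word names node i of
// an implicit binary tree over the bucket range [0, 2^max_division_level),
// and the node at level k, section j is the split point
// (2*j + 1) * 2^(max_division_level - k - 1).
#include <cassert>
#include <cstdint>
#include <vector>

#include "absl/numeric/bits.h"

std::vector<int> BinarySplitsToBuckets(int64_t split, int max_division_level) {
  std::vector<int> buckets;
  uint32_t index = 0;  // Flat node index; level k occupies [2^k - 1, 2^(k+1) - 2].
  while (split > 0) {
    if (split % 2 == 1) {
      const int level = absl::bit_width(index + 1) - 1;
      const int section = index - ((1u << level) - 1);
      buckets.push_back((2 * section + 1) << (max_division_level - level - 1));
    }
    split >>= 1;
    ++index;
  }
  return buckets;
}

int64_t BucketsToBinarySplits(const std::vector<int>& buckets,
                              int max_division_level) {
  int64_t split = 0;
  for (int b : buckets) {
    int level = max_division_level - 1;
    while (b > 0 && b % 2 == 0) {  // Strip trailing zeros to find the level.
      b >>= 1;
      --level;
    }
    split |= int64_t{1} << (((1u << level) - 1) + b / 2);
  }
  return split;
}

int main() {
  assert(BinarySplitsToBuckets(0, 6).empty());      // Mirrors the Split0 test.
  assert(BucketsToBinarySplits({16}, 6) == 2);      // Mirrors the Split2 test.
  assert(BucketsToBinarySplits({16, 32}, 6) == 3);  // Mirrors the Split3 test.
  return 0;
}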
- for (int32 i = 0; i < total_id_count; ++i) { + for (int32_t i = 0; i < total_id_count; ++i) { *(row_ids_before_padding + i) = i; } } else if (indices_or_row_splits.dims() == 2 && @@ -140,12 +140,12 @@ absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits, // For 2D sparse tensor, as we always combine on the last dimension. // The row ids are just the sample ids which is the first dim of the // indices. - auto indices_matrix = indices_or_row_splits.matrix(); + auto indices_matrix = indices_or_row_splits.matrix(); // TODO(b/432045101): remove this once the bug is fixed. if (indices_matrix.dimension(1) == 2) { - int32 previous_row_id = -1; - for (int32 i = 0; i < total_id_count; ++i) { - int32 current_row_id = indices_matrix(i, 0); + int32_t previous_row_id = -1; + for (int32_t i = 0; i < total_id_count; ++i) { + int32_t current_row_id = indices_matrix(i, 0); if (current_row_id < previous_row_id) { return absl::InvalidArgumentError( "Invalid indices_or_row_splits input, indices of SparseTensor " @@ -173,7 +173,7 @@ absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits, "Invalid shape_strides input, expected non-empty shape_strides for " "SparseTensor with rank > 2."); } - int32 previous_row_id = -1; + int32_t previous_row_id = -1; int32_t rank = indices_matrix.dimension(1) - 1; for (int32_t i = 0; i < total_id_count; ++i) { int32_t current_row_id = 0; @@ -205,10 +205,10 @@ absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits, } else if (indices_or_row_splits.dims() == 1 && indices_or_row_splits.NumElements() > 0) { // Ragged tensor to COO format. - const int32* indices_or_row_splits_ptr = - indices_or_row_splits.flat().data(); - int32 current_row_id = -1; - for (int32 i = 0; i < total_id_count; ++i) { + const int32_t* indices_or_row_splits_ptr = + indices_or_row_splits.flat().data(); + int32_t current_row_id = -1; + for (int32_t i = 0; i < total_id_count; ++i) { while (i == *(indices_or_row_splits_ptr + 1 + current_row_id)) { current_row_id += 1; } @@ -308,7 +308,7 @@ absl::Status SortDedupAndCountStatsOfCooTensor( uint32_t previous_id_array_index = 0; for (int32_t index = 0; index < total_id_count; ++index) { uint64_t item = per_feature_col_ids_index_list[index]; - int32 col_id = item >> 32; + int32_t col_id = item >> 32; uint32_t id_array_index = item & 0xffffffff; int32_t row_id = *(row_ids_ptr + id_array_index); // If the row ids and col ids are both same as the previous one, @@ -362,9 +362,9 @@ class ConvertToCooTensorOp : public OpKernel { OP_REQUIRES_OK(ctx, ValidateInputs(*indices_or_row_splits, *values, *weights, sample_count_)); - const int32 total_id_count = values->NumElements(); + const int32_t total_id_count = values->NumElements(); - auto row_ids_before_dedup = std::make_unique(total_id_count); + auto row_ids_before_dedup = std::make_unique(total_id_count); OP_REQUIRES_OK(ctx, ComputeRowIdsBeforePadding( *indices_or_row_splits, total_id_count, @@ -382,14 +382,14 @@ class ConvertToCooTensorOp : public OpKernel { auto combiner_scale_transform_fn = GetCombinerScaleTransformFunction(combiner_); - const int32* row_ids_before_dedup_ptr = row_ids_before_dedup.get(); - const int32* values_ptr = values->flat().data(); + const int32_t* row_ids_before_dedup_ptr = row_ids_before_dedup.get(); + const int32_t* values_ptr = values->flat().data(); const float* weights_ptr = weights->flat().data(); // Dedup the ids within one sample by just checking the adjacent ids. This // will NOT result in a full deduplication. 
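// [Editor's sketch; not the TF kernel.] For the ragged-tensor branch of
// ComputeRowIdsBeforePadding above, the 1-D row_splits input is expanded into
// one row id per value: row_splits {0, 2, 2, 5} (three samples, five values)
// becomes row ids {0, 0, 2, 2, 2}, with the empty middle sample skipped.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int32_t> RowSplitsToRowIds(const std::vector<int32_t>& row_splits) {
  std::vector<int32_t> row_ids;
  if (row_splits.empty()) return row_ids;
  const int32_t total_id_count = row_splits.back();
  row_ids.reserve(total_id_count);
  const int32_t num_rows = static_cast<int32_t>(row_splits.size()) - 1;
  int32_t row = 0;
  for (int32_t i = 0; i < total_id_count; ++i) {
    // Advance past every sample whose end offset has already been reached.
    while (row + 1 < num_rows && i >= row_splits[row + 1]) ++row;
    row_ids.push_back(row);
  }
  return row_ids;
}

int main() {
  assert((RowSplitsToRowIds({0, 2, 2, 5}) ==
          std::vector<int32_t>{0, 0, 2, 2, 2}));
  return 0;
}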
- std::vector row_ids; - std::vector col_ids; + std::vector row_ids; + std::vector col_ids; std::vector gains; row_ids.reserve(total_id_count); col_ids.reserve(total_id_count); @@ -400,8 +400,8 @@ class ConvertToCooTensorOp : public OpKernel { const float gain = *weights_ptr; const float rescaled_gain = combiner_scale_contribution_fn(gain); for (int token_id = 0; token_id < total_id_count; ++token_id) { - const int32 row_id = *(row_ids_before_dedup_ptr + token_id); - const int32 col_id = *(values_ptr + token_id); + const int32_t row_id = *(row_ids_before_dedup_ptr + token_id); + const int32_t col_id = *(values_ptr + token_id); if (gains_rescale.has_value()) { // Compute the gain rescale before doing the dedup. (*gains_rescale)[row_id] += rescaled_gain; @@ -417,8 +417,8 @@ class ConvertToCooTensorOp : public OpKernel { } } else { for (int token_id = 0; token_id < total_id_count; ++token_id) { - const int32 row_id = *(row_ids_before_dedup_ptr + token_id); - const int32 col_id = *(values_ptr + token_id); + const int32_t row_id = *(row_ids_before_dedup_ptr + token_id); + const int32_t col_id = *(values_ptr + token_id); const float gain = *(weights_ptr + token_id); if (gains_rescale.has_value()) { // Compute the gain rescale before doing the dedup. @@ -435,7 +435,7 @@ class ConvertToCooTensorOp : public OpKernel { } } - const int32 output_id_count = row_ids.size(); + const int32_t output_id_count = row_ids.size(); Tensor* gains_tensor; OP_REQUIRES_OK(ctx, @@ -450,8 +450,8 @@ class ConvertToCooTensorOp : public OpKernel { ctx, ctx->allocate_output("col_ids", TensorShape({output_id_count}), &col_ids_tensor)); - int32* row_ids_tensor_ptr = row_ids_tensor->flat().data(); - int32* col_ids_tensor_ptr = col_ids_tensor->flat().data(); + int32_t* row_ids_tensor_ptr = row_ids_tensor->flat().data(); + int32_t* col_ids_tensor_ptr = col_ids_tensor->flat().data(); float* gains_tensor_ptr = gains_tensor->flat().data(); if (gains_rescale.has_value()) { @@ -535,11 +535,11 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { feature_width_, &max_ids_per_partition, &max_unique_ids_per_partition)); - const int32* row_ids_tensor_ptr = row_ids->flat().data(); - const int32* col_ids_tensor_ptr = col_ids->flat().data(); + const int32_t* row_ids_tensor_ptr = row_ids->flat().data(); + const int32_t* col_ids_tensor_ptr = col_ids->flat().data(); const float* gains_tensor_ptr = gains->flat().data(); - const int64* splits_tensor_ptr = splits->flat().data(); - const int32* id_counts_tensor_ptr = id_counts->flat().data(); + const int64_t* splits_tensor_ptr = splits->flat().data(); + const int32_t* id_counts_tensor_ptr = id_counts->flat().data(); const int32_t total_id_count = row_ids->NumElements(); @@ -556,9 +556,9 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { const int max_division_level = GetMinibatchMaxDivisionLevel(); - const int32 kMaxDivisions = 1 << max_division_level; + const int32_t kMaxDivisions = 1 << max_division_level; - int64 binary_splits = 0; + int64_t binary_splits = 0; for (int i = 0; i < splits->NumElements(); ++i) { binary_splits |= *(splits_tensor_ptr + i); } @@ -566,7 +566,7 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { std::vector bucket_splits = ConvertBinarySplitsToBucketSplits(binary_splits, max_division_level); - const int32 num_minibatch_per_sc = bucket_splits.size() + 1; + const int32_t num_minibatch_per_sc = bucket_splits.size() + 1; 
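// [Editor's sketch; illustrative only.] The loop above dedups only *adjacent*
// duplicates: walking the (row, col, gain) triples in order, a triple whose
// row and col both match the previous entry folds its gain into that entry
// instead of emitting a new id. A stand-alone version of that idea:
#include <cassert>
#include <cstdint>
#include <vector>

struct CooTensor {
  std::vector<int32_t> row_ids;
  std::vector<int32_t> col_ids;
  std::vector<float> gains;
};

CooTensor DedupAdjacent(const std::vector<int32_t>& rows,
                        const std::vector<int32_t>& cols,
                        const std::vector<float>& gains) {
  CooTensor out;
  for (size_t i = 0; i < rows.size(); ++i) {
    if (!out.row_ids.empty() && out.row_ids.back() == rows[i] &&
        out.col_ids.back() == cols[i]) {
      out.gains.back() += gains[i];  // Same id as the previous token: merge.
    } else {
      out.row_ids.push_back(rows[i]);
      out.col_ids.push_back(cols[i]);
      out.gains.push_back(gains[i]);
    }
  }
  return out;
}

int main() {
  // Sample 0 looks up id 7 twice in a row; the repeat only accumulates gain.
  const CooTensor coo = DedupAdjacent({0, 0, 1}, {7, 7, 7}, {1.f, 1.f, 1.f});
  assert(coo.row_ids.size() == 2);
  assert(coo.gains[0] == 2.f);
  return 0;
}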
sparse_core_ops_stats_handler_->Record(StatsType::NUM_MINIBATCHES_PER_SC, num_minibatch_per_sc, device_name_, table_name_); @@ -588,7 +588,7 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { bucket_splits.insert(bucket_splits.begin(), 0); bucket_splits.push_back(kMaxDivisions); - const int32 max_ids_per_chip = max_ids_per_chip_per_sample_ * sample_count_; + const int32_t max_ids_per_chip = max_ids_per_chip_per_sample_ * sample_count_; OP_REQUIRES( ctx, max_ids_per_chip % xla_pad_size == 0, @@ -596,8 +596,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { "The max_ids_per_chip is set to be ", max_ids_per_chip, " which is not divisible by the xla_pad_size ", xla_pad_size, " ."))); - const int32 padded_row_pointers_size_per_sc = - xla::RoundUpTo(num_physical_replica, xla_pad_size); + const int32_t padded_row_pointers_size_per_sc = + xla::RoundUpTo(num_physical_replica, xla_pad_size); Tensor* row_pointers_tensor; OP_REQUIRES_OK(ctx, @@ -619,11 +619,12 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK( ctx, ctx->allocate_output("sorted_gains", TensorShape({max_ids_per_chip}), &sorted_gains_tensor)); - int32* row_pointers_tensor_ptr = row_pointers_tensor->flat().data(); - int32* sorted_sample_ids_tensor_ptr = - sorted_sample_ids_tensor->flat().data(); - int32* sorted_token_ids_tensor_ptr = - sorted_token_ids_tensor->flat().data(); + int32_t* row_pointers_tensor_ptr = + row_pointers_tensor->flat().data(); + int32_t* sorted_sample_ids_tensor_ptr = + sorted_sample_ids_tensor->flat().data(); + int32_t* sorted_token_ids_tensor_ptr = + sorted_token_ids_tensor->flat().data(); float* sorted_gains_tensor_ptr = sorted_gains_tensor->flat().data(); // This packed id count is used to track how many ids we have packed into @@ -631,8 +632,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { // dropped. int32_t packed_id_count = 0; - int32 global_index = 0; - int32 row_pointers_index = 0; + int32_t global_index = 0; + int32_t row_pointers_index = 0; for (int sc_id = 0; sc_id < num_sc_per_chip_; ++sc_id) { for (int i = 1; i < bucket_splits.size(); ++i) { for (int replica_id = 0; replica_id < num_physical_replica; @@ -686,8 +687,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { } *(row_pointers_tensor_ptr + row_pointers_index) = global_index; - int32 num_ids_to_pad_per_replica = - xla::RoundUpTo(global_index, xla_pad_size) - global_index; + int32_t num_ids_to_pad_per_replica = + xla::RoundUpTo(global_index, xla_pad_size) - global_index; std::fill_n(sorted_token_ids_tensor_ptr + global_index, num_ids_to_pad_per_replica, kXlaPadValue); std::fill_n(sorted_sample_ids_tensor_ptr + global_index, @@ -698,8 +699,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { ++row_pointers_index; } // Pad the row_pointers to be memory aligned. - int32 num_row_pointers_to_pad = - xla::RoundUpTo(row_pointers_index, xla_pad_size) - + int32_t num_row_pointers_to_pad = + xla::RoundUpTo(row_pointers_index, xla_pad_size) - row_pointers_index; std::fill_n(row_pointers_tensor_ptr + row_pointers_index, num_row_pointers_to_pad, global_index); @@ -718,7 +719,7 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { << " . 
This could potentially impact the model quality."; } - int32 row_pointers_unpadded_size = + int32_t row_pointers_unpadded_size = total_num_minibatch * padded_row_pointers_size_per_sc; Tensor* num_minibatches_per_physical_sparse_core_tensor; @@ -736,11 +737,11 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, ctx->allocate_output("ids_unpadded_size", TensorShape({}), &ids_unpadded_size_tensor)); - num_minibatches_per_physical_sparse_core_tensor->flat()(0) = + num_minibatches_per_physical_sparse_core_tensor->flat()(0) = num_minibatch_per_sc; - row_pointers_unpadded_size_tensor->flat()(0) = + row_pointers_unpadded_size_tensor->flat()(0) = row_pointers_unpadded_size; - ids_unpadded_size_tensor->flat()(0) = ids_unpadded_size; + ids_unpadded_size_tensor->flat()(0) = ids_unpadded_size; } #ifdef LIBTPU_ON_GCE @@ -778,7 +779,7 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, ctx->input("program_key", &program_key_t)); tstring program_key = program_key_t->vec()(0); - int32 per_sc_sample_count = sample_count_ / num_sc_per_chip_; + int32_t per_sc_sample_count = sample_count_ / num_sc_per_chip_; int64_t max_ids_per_partition = -1; int64_t max_unique_ids_per_partition = -1; @@ -802,10 +803,10 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { const Tensor* gains; OP_REQUIRES_OK(ctx, ctx->input("gains", &gains)); - const int32 total_id_count = row_ids->NumElements(); + const int32_t total_id_count = row_ids->NumElements(); - const int32* row_ids_ptr = row_ids->flat().data(); - const int32* col_ids_ptr = col_ids->flat().data(); + const int32_t* row_ids_ptr = row_ids->flat().data(); + const int32_t* col_ids_ptr = col_ids->flat().data(); const float* gains_ptr = gains->flat().data(); #ifndef NDEBUG @@ -829,7 +830,7 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { const int max_division_level = GetMinibatchMaxDivisionLevel(); - const int32 kMaxDivisions = 1 << max_division_level; + const int32_t kMaxDivisions = 1 << max_division_level; // The id counts tensor is the running sum of the number of ids for all // buckets for all the replicas on each SparseCore. @@ -842,7 +843,7 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { TensorShape( {kMaxDivisions * num_sc_per_chip_ * num_physical_replica + 1}), &id_counts_tensor)); - int32* id_counts_tensor_ptr = id_counts_tensor->flat().data(); + int32_t* id_counts_tensor_ptr = id_counts_tensor->flat().data(); *id_counts_tensor_ptr = 0; const int32_t division_size = @@ -855,8 +856,8 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { // 0001011 -> 0001 01 1 // which mean split at level 0 section 0, level 1 section 0 and level // 2 section 0. the split points are [128, 256, 512]. - int64 pre_merge_splits = 0; - int64 after_merge_splits = 0; + int64_t pre_merge_splits = 0; + int64_t after_merge_splits = 0; // Vector of uint64_t storing the col ids in the upper 32 bit and the index // to the original id array in the lower 32 bit. 
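// [Editor's sketch; not the kernel code.] The packing described just above
// (col id in the upper 32 bits, original array index in the lower 32 bits)
// lets one std::sort order ids by column while remembering where each id came
// from, so row ids and gains can still be looked up after sorting:
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<uint64_t> PackAndSortByColId(const std::vector<int32_t>& col_ids) {
  std::vector<uint64_t> packed;
  packed.reserve(col_ids.size());
  for (uint32_t i = 0; i < col_ids.size(); ++i) {
    const uint64_t col = static_cast<uint32_t>(col_ids[i]);
    packed.push_back((col << 32) | i);
  }
  std::sort(packed.begin(), packed.end());
  return packed;
}

int main() {
  const std::vector<uint64_t> packed = PackAndSortByColId({42, 7, 42});
  // Smallest col id first (7), which originally sat at index 1.
  assert(static_cast<int32_t>(packed[0] >> 32) == 7);
  assert(static_cast<uint32_t>(packed[0] & 0xffffffff) == 1);
  return 0;
}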
std::vector> col_ids_index_list( @@ -926,7 +927,7 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { int32_t previous_row_id = -1; uint32_t previous_id_array_index = 0; for (uint64_t item : col_ids_index_list[sc_id]) { - int32 col_id = item >> 32; + int32_t col_id = item >> 32; uint32_t id_array_index = item & 0xffffffff; int32_t row_id = *(row_ids_ptr + id_array_index); // If the row ids and col ids are both same as the previous one, @@ -1027,9 +1028,9 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { if (level > 0 && (pre_merge_splits & (1LL << (pre_start_bit_pos + (section >> 1)))) == 0) continue; - int32 id_count = id_counter[(section + 1) * section_size] - - id_counter[section * section_size]; - int32 unique_id_count = + int32_t id_count = id_counter[(section + 1) * section_size] - + id_counter[section * section_size]; + int32_t unique_id_count = unique_id_counter[(section + 1) * section_size] - unique_id_counter[section * section_size]; // If the number of ids or unique ids exceeds the limit, We need to @@ -1155,17 +1156,17 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { Tensor* splits_tensor; OP_REQUIRES_OK( ctx, ctx->allocate_output("splits", TensorShape({}), &splits_tensor)); - splits_tensor->flat()(0) = after_merge_splits; + splits_tensor->flat()(0) = after_merge_splits; Tensor* max_ids_tensor; OP_REQUIRES_OK( ctx, ctx->allocate_output("max_ids", TensorShape({}), &max_ids_tensor)); - max_ids_tensor->flat()(0) = this_max_ids; + max_ids_tensor->flat()(0) = this_max_ids; Tensor* max_uniques_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_output("max_uniques", TensorShape({}), &max_uniques_tensor)); - max_uniques_tensor->flat()(0) = this_max_uniques; + max_uniques_tensor->flat()(0) = this_max_uniques; } #ifdef LIBTPU_ON_GCE @@ -1197,12 +1198,12 @@ void StoreMinibatchStatisticsInFdoOp::Compute(OpKernelContext* ctx) { const Tensor* max_ids_t; OP_REQUIRES_OK(ctx, ctx->input("max_ids", &max_ids_t)); - int64_t max_ids = max_ids_t->scalar()(); + int64_t max_ids = max_ids_t->scalar()(); const Tensor* max_uniques_t; OP_REQUIRES_OK(ctx, ctx->input("max_uniques", &max_uniques_t)); - int64_t max_uniques = max_uniques_t->scalar()(); + int64_t max_uniques = max_uniques_t->scalar()(); - int32 per_sc_sample_count = sample_count_ / num_sc_per_chip_; + int32_t per_sc_sample_count = sample_count_ / num_sc_per_chip_; int64_t max_ids_per_partition = -1; int64_t max_unique_ids_per_partition = -1; @@ -1264,10 +1265,10 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, ValidateInputs(*indices_or_row_splits, *values, *weights, sample_count_)); - const int32 total_id_count = values->NumElements(); + const int32_t total_id_count = values->NumElements(); - auto row_ids_before_dedup = std::unique_ptr( - new std::remove_extent_t[total_id_count]); + auto row_ids_before_dedup = std::unique_ptr( + new std::remove_extent_t[total_id_count]); OP_REQUIRES_OK(ctx, ComputeRowIdsBeforePadding(*indices_or_row_splits, total_id_count, sample_count_, @@ -1285,14 +1286,14 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) { auto combiner_scale_transform_fn = GetCombinerScaleTransformFunction(combiner_); - const int32* row_ids_before_dedup_ptr = row_ids_before_dedup.get(); - const int32* values_ptr = values->flat().data(); + const int32_t* row_ids_before_dedup_ptr = row_ids_before_dedup.get(); + const int32_t* values_ptr = values->flat().data(); const float* 
weights_ptr = weights->flat().data(); // Dedup the ids within one sample by just checking the adjacent ids. This // will NOT result in a full deduplication. - std::vector row_ids; - std::vector col_ids; + std::vector row_ids; + std::vector col_ids; std::vector gains; row_ids.reserve(total_id_count); col_ids.reserve(total_id_count); @@ -1306,8 +1307,8 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) { const float gain = *weights_ptr; const float rescaled_gain = combiner_scale_contribution_fn(gain); for (int token_id = 0; token_id < total_id_count; ++token_id) { - const int32 row_id = *(row_ids_before_dedup_ptr + token_id); - const int32 col_id = *(values_ptr + token_id); + const int32_t row_id = *(row_ids_before_dedup_ptr + token_id); + const int32_t col_id = *(values_ptr + token_id); if (gains_rescale.has_value()) { // Compute the gain rescale before doing the dedup. (*gains_rescale)[row_id] += rescaled_gain; @@ -1324,8 +1325,8 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) { } } else { for (int token_id = 0; token_id < total_id_count; ++token_id) { - const int32 row_id = *(row_ids_before_dedup_ptr + token_id); - const int32 col_id = *(values_ptr + token_id); + const int32_t row_id = *(row_ids_before_dedup_ptr + token_id); + const int32_t col_id = *(values_ptr + token_id); const float gain = *(weights_ptr + token_id); if (gains_rescale.has_value()) { // Compute the gain rescale before doing the dedup. @@ -1371,8 +1372,8 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) { ctx, col_ids_output_list.allocate( i, TensorShape({per_sc_token_count[i]}), &col_ids_tensor)); - int32* row_ids_tensor_ptr = row_ids_tensor->flat().data(); - int32* col_ids_tensor_ptr = col_ids_tensor->flat().data(); + int32_t* row_ids_tensor_ptr = row_ids_tensor->flat().data(); + int32_t* col_ids_tensor_ptr = col_ids_tensor->flat().data(); float* gains_tensor_ptr = gains_tensor->flat().data(); WriteToOutputTensor( @@ -1384,10 +1385,10 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) { } void ConvertToListOfSparseCoreCooTensorsOp::WriteToOutputTensor( - int32* row_ids, int32* col_ids, float* gains, int32* row_ids_tensor_ptr, - int32* col_ids_tensor_ptr, float* gains_tensor_ptr, int32_t begin_index, - int32_t end_index, int32_t sc_id, - std::optional> gains_rescale) { + int32_t* row_ids, int32_t* col_ids, float* gains, + int32_t* row_ids_tensor_ptr, int32_t* col_ids_tensor_ptr, + float* gains_tensor_ptr, int32_t begin_index, int32_t end_index, + int32_t sc_id, std::optional> gains_rescale) { tsl::profiler::TraceMe traceme( "ConvertToListOfSparseCoreCooTensorsOp::WriteToOutputTensor"); if (gains_rescale.has_value()) { @@ -1407,12 +1408,13 @@ void ConvertToListOfSparseCoreCooTensorsOp::WriteToOutputTensor( } } else { std::transform(row_ids + begin_index, row_ids + end_index, - row_ids_tensor_ptr, [this, &sc_id](int32 row_id) -> int32 { + row_ids_tensor_ptr, + [this, &sc_id](int32_t row_id) -> int32_t { return row_id % per_sc_sample_count_ + per_sc_row_offset_ + per_sc_stacked_table_sample_count_ * sc_id; }); std::transform(col_ids + begin_index, col_ids + end_index, - col_ids_tensor_ptr, [this](int32 col_id) -> int32 { + col_ids_tensor_ptr, [this](int32_t col_id) -> int32_t { return ((col_id + col_shift_) & num_sc_shards_bit_mod_) + (col_id & num_sc_shards_bit_mod_inv_) + col_offset_; }); @@ -1804,7 +1806,7 @@ void ConvertToSparseCoreCsrWrappedCooTensorOp::Compute(OpKernelContext* ctx) { } *(row_pointers_tensor_ptr 
+ row_pointers_index) = global_index; - int32 num_ids_to_pad_per_replica = + int32_t num_ids_to_pad_per_replica = xla::RoundUpTo(global_index, xla_pad_size) - global_index; std::fill_n(sorted_token_ids_tensor_ptr + global_index, @@ -1818,8 +1820,8 @@ void ConvertToSparseCoreCsrWrappedCooTensorOp::Compute(OpKernelContext* ctx) { ++row_pointers_index; } // Pad the row_pointers to be memory aligned. - int32 num_row_pointers_to_pad = - xla::RoundUpTo(row_pointers_index, xla_pad_size) - + int32_t num_row_pointers_to_pad = + xla::RoundUpTo(row_pointers_index, xla_pad_size) - row_pointers_index; std::fill_n(row_pointers_tensor_ptr + row_pointers_index, num_row_pointers_to_pad, global_index); @@ -1838,7 +1840,7 @@ void ConvertToSparseCoreCsrWrappedCooTensorOp::Compute(OpKernelContext* ctx) { << " . This could potentially impact the model quality."; } - int32 row_pointers_unpadded_size = + int32_t row_pointers_unpadded_size = total_num_minibatch * padded_row_pointers_size_per_sc; Tensor* num_minibatches_per_sc_tensor; @@ -1855,10 +1857,10 @@ void ConvertToSparseCoreCsrWrappedCooTensorOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, ctx->allocate_output("ids_unpadded_size", TensorShape({}), &ids_unpadded_size_tensor)); - num_minibatches_per_sc_tensor->flat()(0) = num_minibatch_per_sc; - row_pointers_unpadded_size_tensor->flat()(0) = + num_minibatches_per_sc_tensor->flat()(0) = num_minibatch_per_sc; + row_pointers_unpadded_size_tensor->flat()(0) = row_pointers_unpadded_size; - ids_unpadded_size_tensor->flat()(0) = ids_unpadded_size; + ids_unpadded_size_tensor->flat()(0) = ids_unpadded_size; } REGISTER_KERNEL_BUILDER( diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h index 05bc79e416de8a..706622ae1dfbe4 100644 --- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h +++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h @@ -34,15 +34,15 @@ namespace tensorflow { // Struct to describe an embedding lookup input data. struct EmbeddingLookupInput { // Which replica it belongs. - int32 replica_id; + int32_t replica_id; // Token id. - int32 token_id; + int32_t token_id; // Sample id. - int32 sample_id; + int32_t sample_id; // Gain. float gain; - EmbeddingLookupInput(int32 replica_id, int32 token_id, int32 sample_id, + EmbeddingLookupInput(int32_t replica_id, int32_t token_id, int32_t sample_id, float gain) : replica_id(replica_id), token_id(token_id), @@ -56,9 +56,9 @@ absl::Status ValidateInputs(const Tensor& indices_or_row_splits, // Compute the row id list before padding. 
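// [Editor's sketch; illustrative only.] The padding above rounds each id count
// up to the XLA pad size with xla::RoundUpTo and fills the tail with
// kXlaPadValue (INT32_MAX) so every SparseCore input gets a static, aligned
// shape. The arithmetic amounts to:
#include <cassert>
#include <cstdint>
#include <limits>
#include <vector>

constexpr int32_t kPadValue = std::numeric_limits<int32_t>::max();

// Same contract as xla::RoundUpTo for non-negative values.
int32_t RoundUpTo(int32_t value, int32_t multiple) {
  return (value + multiple - 1) / multiple * multiple;
}

void PadToAlignedSize(std::vector<int32_t>& ids, int32_t pad_size) {
  ids.resize(RoundUpTo(static_cast<int32_t>(ids.size()), pad_size), kPadValue);
}

int main() {
  std::vector<int32_t> ids = {3, 1, 4, 1, 5};
  PadToAlignedSize(ids, /*pad_size=*/8);
  assert(ids.size() == 8 && ids.back() == kPadValue);
  return 0;
}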
absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits, - int32 total_id_count, - int32 sample_count, - int32* row_ids_before_padding, + int32_t total_id_count, + int32_t sample_count, + int32_t* row_ids_before_padding, std::vector shape_strides = {}); class GetMinibatchesInCsrWithPhysicalReplicaOp : public OpKernel { @@ -101,7 +101,7 @@ class GetMinibatchSplitsWithPhysicalReplicaOp : public OpKernel { void Compute(OpKernelContext* ctx) override; protected: - virtual void CalculateHeadroom(int32 this_max_ids, int32 this_max_uniques, + virtual void CalculateHeadroom(int32_t this_max_ids, int32_t this_max_uniques, tstring program_key, int64_t max_ids_per_partition, int64_t max_unique_ids_per_partition, @@ -138,7 +138,7 @@ class StoreMinibatchStatisticsInFdoOp : public OpKernel { void Compute(OpKernelContext* ctx) override; protected: - virtual void CalculateHeadroom(int32 this_max_ids, int32 this_max_uniques, + virtual void CalculateHeadroom(int32_t this_max_ids, int32_t this_max_uniques, tstring program_key, int64_t max_ids_per_partition, int64_t max_unique_ids_per_partition) {} @@ -165,10 +165,11 @@ class ConvertToListOfSparseCoreCooTensorsOp : public OpKernel { void Compute(OpKernelContext* ctx) override; private: - void WriteToOutputTensor(int32* row_ids, int32* col_ids, float* gains, - int32* row_ids_tensor_ptr, int32* col_ids_tensor_ptr, - float* gains_tensor_ptr, int32_t begin_index, - int32_t end_index, int32_t sc_id, + void WriteToOutputTensor(int32_t* row_ids, int32_t* col_ids, float* gains, + int32_t* row_ids_tensor_ptr, + int32_t* col_ids_tensor_ptr, float* gains_tensor_ptr, + int32_t begin_index, int32_t end_index, + int32_t sc_id, std::optional> gains_rescale); int sample_count_; int num_sc_per_chip_; diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc index 50e86ba0198602..f3576628d048bc 100644 --- a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc +++ b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc @@ -143,7 +143,7 @@ class XlaSparseDenseMatmulOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { xla::XlaBuilder* builder = ctx->builder(); - const int32 num_physical_replica = + const int32_t num_physical_replica = stream_executor::tpu::OpsApiFn()->TpuTopology_AvailableCoreCountFn( /*mesh_state=*/nullptr, /*tpu_core_type=*/TpuCoreTypeEnum::kEmbeddingV2); @@ -662,7 +662,7 @@ class XlaSparseDenseMatmulGradWithCsrInputBase : public XlaOpKernel { errors::InvalidArgument( "activations input has non static or non-rank 2 shape: ", activation_shape.ToString())); - int64 num_samples_per_chip = activation_shape.dimensions(0); + int64_t num_samples_per_chip = activation_shape.dimensions(0); OP_REQUIRES(ctx, num_samples_per_chip % num_sparsecores_per_device_ == 0, errors::InvalidArgument( "num_samples_per_chip ", num_samples_per_chip, diff --git a/tensorflow/core/tpu/kernels/topk_ops.cc b/tensorflow/core/tpu/kernels/topk_ops.cc index 16334632946c25..22d18e39220146 100644 --- a/tensorflow/core/tpu/kernels/topk_ops.cc +++ b/tensorflow/core/tpu/kernels/topk_ops.cc @@ -51,21 +51,21 @@ xla::XlaOp CreateKthOrderStatisticComputation(xla::XlaBuilder* builder, const int64_t width = input_shape.dim_size(1); xla::XlaOp input_sm32 = xla::BitcastConvertType(input, xla::S32); - xla::XlaOp zero_r0 = xla::ConstantR0(builder, 0); + xla::XlaOp zero_r0 = xla::ConstantR0(builder, 0); xla::XlaOp zero_r1 = xla::Broadcast(zero_r0, {height}); xla::XlaOp zero_r2 = xla::Broadcast(zero_r0, {height, 
width}); - xla::XlaOp max_r0 = xla::ConstantR0(builder, 0x7FFFFFFF); + xla::XlaOp max_r0 = xla::ConstantR0(builder, 0x7FFFFFFF); xla::XlaOp max_r1 = xla::Broadcast(max_r0, {height}); // Start at positive zero, so that pivot is always less than top. - xla::XlaOp negative_zero_r0 = xla::ConstantR0(builder, 0x80000000); + xla::XlaOp negative_zero_r0 = xla::ConstantR0(builder, 0x80000000); xla::XlaOp negative_zero_r1 = xla::Broadcast(negative_zero_r0, {height}); xla::XlaOp top_r1 = zero_r1; - for (uint32 mask = 1U << 31; mask; mask >>= 1) { + for (uint32_t mask = 1U << 31; mask; mask >>= 1) { xla::XlaOp broadcast_mask_r1 = - xla::Broadcast(xla::ConstantR0(builder, mask), {height}); + xla::Broadcast(xla::ConstantR0(builder, mask), {height}); // The first iteration of the loop determines if the kth element // is positive or negative. If the kth element is negative, we @@ -111,14 +111,14 @@ class KthOrderStatistic : public XlaOpKernel { ctx, input_shape.dims() == 2, InvalidArgument("input must be rank-2: ", input_shape.DebugString())); - xla::XlaOp k = xla::ConstantR0(builder, k_); + xla::XlaOp k = xla::ConstantR0(builder, k_); xla::XlaOp kth_order_statistics = CreateKthOrderStatisticComputation(builder, input_shape, input, k); ctx->SetOutput(0, kth_order_statistics); } private: - int32 k_; + int32_t k_; }; REGISTER_XLA_OP(Name("KthOrderStatistic"), KthOrderStatistic); @@ -269,21 +269,21 @@ xla::XlaOp CreateMakeUnique(xla::XlaBuilder* builder, const xla::XlaOp input, // count_mask is used to mask away the low order bits to ensure // that every element is distinct. uint32_t next_power_of_two = absl::bit_ceil(width); - uint32 count_mask = ~(next_power_of_two - 1); + uint32_t count_mask = ~(next_power_of_two - 1); xla::XlaOp count_mask_r0 = xla::ConstantR0(builder, count_mask); xla::XlaOp count_mask_r2 = xla::Broadcast(count_mask_r0, {height, width}); // smallest_normal is the bit representation of the smallest // positive normal floating point number. The sign is zero, // exponent is one, and the fraction is zero. - uint32 smallest_normal = 1U << 23; + uint32_t smallest_normal = 1U << 23; xla::XlaOp smallest_normal_r0 = xla::ConstantR0(builder, smallest_normal); xla::XlaOp smallest_normal_r2 = xla::Broadcast(smallest_normal_r0, {height, width}); // Used to mask away the sign bit when computing the absolute // value. 
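// [Editor's note, rendered as a sketch; not the XLA computation above.]
// KthOrderStatistic bitcasts f32 values to S32 and binary-searches over the
// bits from the sign bit down. That works because IEEE-754 floats are nearly
// radix-sortable: for non-negative values the raw bit pattern already orders
// like the number, and only the sign bit needs special handling (which the
// first loop iteration above provides). The classic monotonic transform below
// shows that property in isolation:
#include <cassert>
#include <cstdint>
#include <cstring>

// Maps a finite float to a uint32_t whose unsigned order matches the float's
// numeric order; negative numbers get all bits flipped, non-negative numbers
// get the sign bit set.
uint32_t OrderedBits(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return (bits & 0x80000000u) ? ~bits : (bits | 0x80000000u);
}

int main() {
  assert(OrderedBits(-2.0f) < OrderedBits(-1.0f));
  assert(OrderedBits(-1.0f) < OrderedBits(0.0f));
  assert(OrderedBits(0.0f) < OrderedBits(1.5f));
  assert(OrderedBits(1.5f) < OrderedBits(2.0f));
  return 0;
}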
- uint32 low_bit_mask = ~(1U << 31); + uint32_t low_bit_mask = ~(1U << 31); xla::XlaOp low_bit_mask_r0 = xla::ConstantR0(builder, low_bit_mask); xla::XlaOp low_bit_mask_r2 = xla::Broadcast(low_bit_mask_r0, {height, width}); diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h index e8666ec63e171a..06fde06bdcac84 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h @@ -37,11 +37,11 @@ class TpuCompilationCacheRpcLookup : public TpuCompilationCacheLookup { public: using StubType = tpu::grpc::TpuCompilationCacheService::Stub; - TpuCompilationCacheRpcLookup(const string& server_address, + TpuCompilationCacheRpcLookup(const std::string& server_address, int64_t max_cache_size); ~TpuCompilationCacheRpcLookup() override = default; - absl::Status Lookup(const string& proto_key, + absl::Status Lookup(const std::string& proto_key, std::unique_ptr* entry, tpu::CompilationCacheFetchTarget fetch_target) override; @@ -49,11 +49,11 @@ class TpuCompilationCacheRpcLookup : public TpuCompilationCacheLookup { std::unique_ptr* entry, tpu::CompilationCacheFetchTarget fetch_target) override; - string DebugString() const override; + std::string DebugString() const override; private: // Helper method to make the RPC request to the central cache. - absl::Status RemoteLookupLocked(const string& local_proto_key, + absl::Status RemoteLookupLocked(const std::string& local_proto_key, const tpu::GetTpuProgramRequest& request, std::shared_ptr* cache_entry) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc index a456a473c1a836..4f7af33e8c1c35 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc +++ b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc @@ -167,7 +167,7 @@ void TpuCompileOpKernelCommon::Compute(OpKernelContext* ctx) { }); absl::Status compile_status = ComputeInternal(ctx); - string status_payload; + std::string status_payload; // Construct payload if compile_status is not ok and there's no payload for // compilation yet. if (!compile_status.ok() && diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_common.h b/tensorflow/core/tpu/kernels/tpu_compile_op_common.h index 66f7b02e6bc04d..56e2130495750c 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_op_common.h +++ b/tensorflow/core/tpu/kernels/tpu_compile_op_common.h @@ -178,7 +178,7 @@ class TpuCompileOpKernelCommon { std::string mlir_module_; // Fingerprint of the MLIR Module created once on construction to avoid paying // the cost on each invocation. - uint64 mlir_module_fingerprint_ = 0; + uint64_t mlir_module_fingerprint_ = 0; // Number of different programs to compile. This maps to number of cores in // each replica. 
@@ -198,7 +198,7 @@ class TpuCompileOpKernelCommon { absl::Status RegisterXLAFingerprints( const std::vector& arg_shapes, - TpuProgramGroupInterface* tpu_program_group, uint64 fingerprint); + TpuProgramGroupInterface* tpu_program_group, uint64_t fingerprint); }; } // namespace tpu diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h b/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h index 73b0a492b3551c..a6bf93239dc3d4 100644 --- a/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h +++ b/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h @@ -61,7 +61,7 @@ class TpuEmbeddingEngineStateInterface : public ResourceBase { return new TpuEmbeddingEngineStateInterface(state); } - string DebugString() const override { + std::string DebugString() const override { return "TpuEmbeddingEngineStateInterface"; } diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h b/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h index e06c02c99b6cbb..46981718facdb4 100644 --- a/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h +++ b/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h @@ -31,7 +31,8 @@ absl::Status ValidateCombiners(absl::Span combiners); // Validates the `mode_override` input of the TPUEnqueue* ops, and, if correct, // sets the `mode` to pass on to the TPU Embedding manager. absl::Status GetValidatedModeOverride( - const string& mode_override, tpu::TPUEmbeddingConfiguration::Mode* mode); + const std::string& mode_override, + tpu::TPUEmbeddingConfiguration::Mode* mode); } // namespace tensorflow #endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_ENQUEUE_OPS_H_ diff --git a/tensorflow/core/tpu/kernels/tpu_functional_ops.h b/tensorflow/core/tpu/kernels/tpu_functional_ops.h index 1d9e5cd57697ec..45c5fb52e1d9c9 100644 --- a/tensorflow/core/tpu/kernels/tpu_functional_ops.h +++ b/tensorflow/core/tpu/kernels/tpu_functional_ops.h @@ -84,18 +84,19 @@ GroupedEdges GroupTensorsForOutputPacking(Graph* graph, GraphShapeInfo* shape_info); absl::Status CreateConcatAndSplitNodesForInputTensor( - Graph* graph, const string& cluster_name, EdgeShapes* tpu_input_shapes, + Graph* graph, const std::string& cluster_name, EdgeShapes* tpu_input_shapes, const absl::flat_hash_map>& grouped_input_edges, int32_t minimum_input_tensors_packing, bool xla_spmd_input_sharded, const XlaShardingInfoMap& xla_sharding_info, const TpuReplicatedInputInfoMap& tpu_replicated_input_info); absl::Status CreateConcatAndSplitNodesForOutputTensor( - Graph* graph, const string& cluster_name, EdgeShapes* tpu_output_shapes, - GraphShapeInfo* tpu_inferred_info, GroupedEdges shape_to_output, - int32_t minimum_output_tensors_packing); + Graph* graph, const std::string& cluster_name, + EdgeShapes* tpu_output_shapes, GraphShapeInfo* tpu_inferred_info, + GroupedEdges shape_to_output, int32_t minimum_output_tensors_packing); -absl::Status InsertReshapeNodePairs(Graph* graph, const string& cluster_name, +absl::Status InsertReshapeNodePairs(Graph* graph, + const std::string& cluster_name, EdgeShapes* tpu_input_shapes, int num_cores_per_replica); @@ -172,7 +173,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel { }; // This method is thread-safe. 
- absl::Status GetTpuCoreOrdinal(OpKernelContext* ctx, uint64 input_hash, + absl::Status GetTpuCoreOrdinal(OpKernelContext* ctx, uint64_t input_hash, int64_t* ordinal_selector_req_id, int32_t* core_ordinal); @@ -196,11 +197,10 @@ class TPUPartitionedCallOp : public AsyncOpKernel { // device_ordinal: The index of the TPU core that is scheduled to run // the computation. In the case of XLA SPMD, it is the "primary" core, which // is the smallest index of all the cores. - absl::Status InitializeShardedVarOnTPU(OpKernelContext* ctx, - const core::RefCountPtr& var, - std::vector& ndefs, - int split_dim, - const std::vector& tpu_devices) + absl::Status InitializeShardedVarOnTPU( + OpKernelContext* ctx, const core::RefCountPtr& var, + std::vector& ndefs, int split_dim, + const std::vector& tpu_devices) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); // Check if any of the immediate successors of node has attribute @@ -250,7 +250,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel { absl::Status PlacementHelper( const DeviceSet& device_set, const GraphOptimizationPassOptions& optimization_options, - const string& function_name); + const std::string& function_name); // Partitions `graph`, populates `subgraphs` with the partitions, and runs // the post-partitioning graph optimization passes. absl::Status PartitionHelper( @@ -263,15 +263,15 @@ class TPUPartitionedCallOp : public AsyncOpKernel { // If `out_flib_def` is not null, it will be set to a copy of `flib_def_` and // used for instantiation. absl::Status InstantiatePartition( - const Graph& graph, const string& function_name, - const string& target_device, FHandle* handle, + const Graph& graph, const std::string& function_name, + const std::string& target_device, FHandle* handle, std::unique_ptr* out_flib_def) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); // Adds and instantiates functions for each subgraph in `subgraphs` after // rewriting nodes' `device_ordinal` attributes to match `replica_id` when // num_cores_per_replica == 1. absl::Status InstantiateFunctionsFromSubgraphs( - const DeviceSet& device_set, int replica_id, uint64 cache_hash, + const DeviceSet& device_set, int replica_id, uint64_t cache_hash, int num_cores_per_replica, std::unordered_map> subgraphs) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); @@ -344,7 +344,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel { const std::string local_device_name_; // Maps from cache key to their corresponding functions, which are // represented as (device, handle) pairs. - gtl::FlatMap> partition_cache_ + gtl::FlatMap> partition_cache_ ABSL_GUARDED_BY(mu_); // A set contains seen ordinals. Used by variable initialization on TPU. @@ -362,7 +362,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel { FunctionLibraryRuntime* library_runtime_; // Used to uniquify function names in `flib_def_`. - uint32 suffix_ = 0; + uint32_t suffix_ = 0; // Minimum number of run steps (batches) necessary to trigger xla autotuner. int autotuner_thresh_ = 0; @@ -371,7 +371,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel { std::shared_ptr ordinal_selector_; // Maps input hash to TF fingerprint. 
- absl::flat_hash_map inputs_to_fingerprint_; + absl::flat_hash_map inputs_to_fingerprint_; // List of TPU devices std::vector tpu_devices_; diff --git a/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h b/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h index 6e84dde261bb24..1d50e75bb804b3 100644 --- a/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h +++ b/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h @@ -75,7 +75,7 @@ class TpuMeshStateInterface : public tensorflow::ResourceBase { mesh_state_, tpu_core_type); } - string DebugString() const override { return "TpuMeshStateInterface"; } + std::string DebugString() const override { return "TpuMeshStateInterface"; } private: XLA_TpuMeshState* mesh_state_; diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.cc b/tensorflow/core/tpu/kernels/tpu_op_util.cc index 8d1d4861b6fcca..6da81d1ffefabe 100644 --- a/tensorflow/core/tpu/kernels/tpu_op_util.cc +++ b/tensorflow/core/tpu/kernels/tpu_op_util.cc @@ -73,8 +73,8 @@ std::string CreateConfigPrefix(const TPUCompileMetadataProto& metadata) { } } // namespace -uint64 CreateFingerprintWithNameAndShapes( - uint64 name, const std::vector& shapes) { +uint64_t CreateFingerprintWithNameAndShapes( + uint64_t name, const std::vector& shapes) { std::string shape_prefix = CreateShapePrefix(shapes); VLOG(2) << "CreateFingerprintWithNameAndShapes, name: " << name << ", shape_prefix: " << shape_prefix; @@ -85,7 +85,7 @@ uint64 CreateFingerprintWithNameAndShapes( // Return fingerprint_in_metadata if it's not empty; otherwise read input tensor // data to compute the fingerprint. std::string GuaranteedConstFingerprint( - const string& fingerprint_in_metadata, + const std::string& fingerprint_in_metadata, const OpInputList& guaranteed_constants) { if (fingerprint_in_metadata.empty()) { uint64_t fingerprint = 0; @@ -104,8 +104,8 @@ std::string GuaranteedConstFingerprint( // The `guaranteed_constants` must be passed as reference due to the lazy // evaluation of `guaranteed_const_fingerprint()` callback. TpuCompilationCacheKey CreateCompilationCacheKey( - absl::string_view function_name, uint64 function_library_fingerprint, - uint64 mlir_module_fingerprint, const OpInputList& guaranteed_constants, + absl::string_view function_name, uint64_t function_library_fingerprint, + uint64_t mlir_module_fingerprint, const OpInputList& guaranteed_constants, const std::vector& dynamic_shapes, const TPUCompileMetadataProto& metadata, const TpuMeshStateInterface& mesh_state, uint64_t session_id, @@ -151,7 +151,7 @@ TpuCompilationCacheKey CreateCompilationCacheKey( // reference based on the assumption that these variables lifetime is // managed through the `TPUCompileOpKernelImpl` that outlives the // lifetime of the compilation cache lookups. - string fingerprint; + std::string fingerprint; key.guaranteed_const_fingerprint = [&metadata, &guaranteed_constants, fingerprint]() mutable { if (fingerprint.empty()) { diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.h b/tensorflow/core/tpu/kernels/tpu_op_util.h index d0ca805fec4757..df68fdaaff39e5 100644 --- a/tensorflow/core/tpu/kernels/tpu_op_util.h +++ b/tensorflow/core/tpu/kernels/tpu_op_util.h @@ -28,13 +28,13 @@ limitations under the License. namespace tensorflow { namespace tpu { // Creates a fingerprint given the name and the vector of shapes. 
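// [Editor's sketch; stand-in hash and invented prefix format.] The helper
// declared here combines a precomputed 64-bit fingerprint of the function name
// with a textual encoding of the argument shapes, so the same function
// compiled for different input shapes yields a different compilation-cache
// key. A toy version, using FNV-1a where TensorFlow uses Fingerprint64:
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

uint64_t Fnv1a64(const std::string& s, uint64_t seed) {
  uint64_t h = seed;
  for (unsigned char c : s) {
    h ^= c;
    h *= 1099511628211ull;
  }
  return h;
}

uint64_t FingerprintNameAndShapes(
    uint64_t name_fingerprint,
    const std::vector<std::vector<int64_t>>& shapes) {
  std::string shape_prefix;
  for (const auto& shape : shapes) {
    for (int64_t d : shape) shape_prefix += std::to_string(d) + ",";
    shape_prefix += ";";
  }
  // Fold the shape description into the name fingerprint.
  return Fnv1a64(shape_prefix, /*seed=*/name_fingerprint);
}

int main() {
  const uint64_t a = FingerprintNameAndShapes(42, {{2, 3}, {3}});
  const uint64_t b = FingerprintNameAndShapes(42, {{2, 4}, {3}});
  std::cout << "distinct keys: " << (a != b) << "\n";
  return 0;
}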
-uint64 CreateFingerprintWithNameAndShapes( - uint64 name, const std::vector& shapes); +uint64_t CreateFingerprintWithNameAndShapes( + uint64_t name, const std::vector& shapes); // Creates a unique compilation cache `key`. TpuCompilationCacheKey CreateCompilationCacheKey( - absl::string_view function_name, uint64 function_library_fingerprint, - uint64 mlir_module_fingerprint, const OpInputList& guaranteed_constants, + absl::string_view function_name, uint64_t function_library_fingerprint, + uint64_t mlir_module_fingerprint, const OpInputList& guaranteed_constants, const std::vector& dynamic_shapes, const TPUCompileMetadataProto& metadata, const TpuMeshStateInterface& mesh_state, uint64_t session_id = 0, diff --git a/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h b/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h index 9ea689b317f551..3bf1bfac3fe0bb 100644 --- a/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h +++ b/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h @@ -39,7 +39,7 @@ class TPUOrdinalSelector : TPUOrdinalSelectorInterface { stream_executor::tpu::OpsApiFn()->TfTpuOrdinalSelector_DestroyFn( ordinal_selector_); } - int64_t GetOrdinal(std::optional key, int64_t* req_id) override { + int64_t GetOrdinal(std::optional key, int64_t* req_id) override { int64_t ordinal; stream_executor::tpu::OpsApiFn()->TfTpuOrdinalSelector_GetOrdinalFn( ordinal_selector_, key, req_id, &ordinal); diff --git a/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h b/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h index 040959d592a1bd..21ce7b393d6195 100644 --- a/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h +++ b/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h @@ -26,7 +26,7 @@ namespace tpu { class TPUOrdinalSelectorInterface { public: virtual ~TPUOrdinalSelectorInterface() = default; - virtual int64_t GetOrdinal(std::optional key, int64_t* req_id) = 0; + virtual int64_t GetOrdinal(std::optional key, int64_t* req_id) = 0; virtual void DequeueFromCoreSelector(int32_t device_ordinal, int64_t req_id) = 0; }; diff --git a/tensorflow/core/tpu/kernels/tpu_pod_state.cc b/tensorflow/core/tpu/kernels/tpu_pod_state.cc index 1457ceac9b790b..73acdd65ef166c 100644 --- a/tensorflow/core/tpu/kernels/tpu_pod_state.cc +++ b/tensorflow/core/tpu/kernels/tpu_pod_state.cc @@ -123,7 +123,7 @@ TpuPodState::~TpuPodState() { VLOG(1) << "Shutting down Compilation Cache Service done."; } -string TpuPodState::DebugString() const { +std::string TpuPodState::DebugString() const { return "Wrapper for distributed TPU state"; } diff --git a/tensorflow/core/tpu/kernels/tpu_pod_state.h b/tensorflow/core/tpu/kernels/tpu_pod_state.h index b24a512d341cbe..99e2cff3e1f948 100644 --- a/tensorflow/core/tpu/kernels/tpu_pod_state.h +++ b/tensorflow/core/tpu/kernels/tpu_pod_state.h @@ -38,7 +38,7 @@ class TpuPodState : public ResourceBase { ~TpuPodState() override; - string DebugString() const override; + std::string DebugString() const override; private: std::unique_ptr cache_service_; diff --git a/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.cc b/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.cc index f50652f8b5e81c..73214c817eaf04 100644 --- a/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.cc +++ b/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.cc @@ -99,7 +99,7 @@ absl::Status GetComputationCacheEntry( } // Builds an InputBuffers object that describes the inputs to the computation. 
-absl::StatusOr> BuildInputBuffers( +absl::StatusOr> BuildInputBuffers( OpKernelContext* context, const std::vector& variables, const xla::Shape& input_host_shape, xla::Backend* backend, int device_ordinal, se::Stream* stream) { @@ -150,10 +150,11 @@ absl::StatusOr> BuildInputBuffers( validate_shape(variables[i].index(), *variables[i].var()->tensor())); } - se::DeviceMemoryAllocator* const allocator = backend->memory_allocator(); + stream_executor::DeviceAddressAllocator* const allocator = + backend->memory_allocator(); xla::TransferManager* const transfer_manager = backend->transfer_manager(); - xla::ShapeTree input_buffers( + xla::ShapeTree input_buffers( transfer_manager->HostShapeToDeviceShape(input_host_shape)); // Allocates a buffer for the root tuple. @@ -165,15 +166,17 @@ absl::StatusOr> BuildInputBuffers( auto set_input_buffers_helper = [&](int arg_index, xla::ShapedBuffer* buffers, bool owning = false) { buffers->buffers().ForEachMutableElement( - [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + [&](const xla::ShapeIndex& index, + stream_executor::DeviceAddressBase* buffer) { xla::ShapeIndex in_index = {arg_index}; for (int64_t j : index) { in_index.push_back(j); } if (owning) { *input_buffers.mutable_element(in_index) = - se::OwningDeviceMemory(*buffer, device_ordinal, allocator); - *buffer = se::DeviceMemoryBase(); + stream_executor::ScopedDeviceAddress( + *buffer, device_ordinal, allocator); + *buffer = stream_executor::DeviceAddressBase(); } else { *input_buffers.mutable_element(in_index) = *buffer; } @@ -268,7 +271,8 @@ absl::Status UpdateOutputVariables( TF_RET_CHECK(result_buffers.on_host_shape().IsTuple()); TF_RET_CHECK(!xla::ShapeUtil::IsNestedTuple(result_buffers.on_host_shape())); - se::DeviceMemoryAllocator* const allocator = backend->memory_allocator(); + stream_executor::DeviceAddressAllocator* const allocator = + backend->memory_allocator(); auto output_buffers = result_buffers.release(); const xla::Shape& output_host_shape = output_buffers.on_host_shape(); @@ -285,7 +289,8 @@ absl::Status UpdateOutputVariables( xla::ScopedShapedBuffer shaped_buffer(host_shape, device_shape, allocator, device_ordinal); shaped_buffer.buffers().ForEachMutableElement( - [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + [&](const xla::ShapeIndex& index, + stream_executor::DeviceAddressBase* buffer) { xla::ShapeIndex out_index = {i}; for (int64_t j : index) { out_index.push_back(j); diff --git a/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h b/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h index c731cc10ec70ce..ab44f7788fbf50 100644 --- a/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h +++ b/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h @@ -42,7 +42,7 @@ absl::Status GetComputationCacheEntry( std::unique_ptr* entry, tpu::CompilationCacheFetchTarget fetch_target); -absl::StatusOr> BuildInputBuffers( +absl::StatusOr> BuildInputBuffers( OpKernelContext* context, const std::vector& variables, const xla::Shape& input_host_shape, xla::Backend* backend, int device_ordinal, se::Stream* stream); diff --git a/tensorflow/core/tpu/kernels/tpu_util.cc b/tensorflow/core/tpu/kernels/tpu_util.cc index 14223164d1e1b2..25e57e71da8dbf 100644 --- a/tensorflow/core/tpu/kernels/tpu_util.cc +++ b/tensorflow/core/tpu/kernels/tpu_util.cc @@ -48,7 +48,7 @@ absl::StatusOr ParseCompilationCacheKey( TpuCompilationCacheKey parsed_key(splits.at(0)); parsed_key.has_guaranteed_const = true; parsed_key.session_handle = splits.at(1); 
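The kernel and op hunks above and below replace TensorFlow's legacy integer aliases (uint64, uint32, int32) with the <cstdint> fixed-width types. A minimal sketch of why the swap is size-preserving, assuming the legacy aliases come from tensorflow/core/platform/types.h:

#include <cstdint>
#include "tensorflow/core/platform/types.h"  // assumed home of the legacy aliases

// Same width on every platform TensorFlow supports, so the rename is mechanical.
static_assert(sizeof(tensorflow::uint64) == sizeof(std::uint64_t),
              "uint64 alias must match uint64_t");
static_assert(sizeof(tensorflow::int32) == sizeof(std::int32_t),
              "int32 alias must match int32_t");
static_assert(sizeof(tensorflow::uint32) == sizeof(std::uint32_t),
              "uint32 alias must match uint32_t");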
- const string fingerprint = splits.at(2); + const std::string fingerprint = splits.at(2); parsed_key.guaranteed_const_fingerprint = [fingerprint] { return fingerprint; }; diff --git a/tensorflow/core/tpu/kernels/transfer_ops.cc b/tensorflow/core/tpu/kernels/transfer_ops.cc index 703dc3e7589134..1610d807411cdb 100644 --- a/tensorflow/core/tpu/kernels/transfer_ops.cc +++ b/tensorflow/core/tpu/kernels/transfer_ops.cc @@ -51,7 +51,7 @@ limitations under the License. namespace tensorflow { TpuTransferAsyncOpKernelBase::TpuTransferAsyncOpKernelBase( - OpKernelConstruction* ctx, const string& transfer_type, + OpKernelConstruction* ctx, const std::string& transfer_type, int number_of_threads, std::unique_ptr transfer_op) : AsyncOpKernel(ctx), transfer_type_(transfer_type), @@ -113,7 +113,7 @@ absl::Status TpuTransferAsyncOpKernelBase::RunTransferWithOrdinal( } TpuTransferAsyncOpKernel::TpuTransferAsyncOpKernel( - OpKernelConstruction* ctx, const string& transfer_type, + OpKernelConstruction* ctx, const std::string& transfer_type, int number_of_threads, std::unique_ptr transfer_op) : TpuTransferAsyncOpKernelBase(ctx, transfer_type, number_of_threads, std::move(transfer_op)) { @@ -132,7 +132,7 @@ absl::Status TpuTransferAsyncOpKernel::RunTransfer(OpKernelContext* ctx) { } TpuTransferAsyncDynamicOrdinalOpKernel::TpuTransferAsyncDynamicOrdinalOpKernel( - OpKernelConstruction* ctx, const string& transfer_type, + OpKernelConstruction* ctx, const std::string& transfer_type, int number_of_threads, std::unique_ptr transfer_op) : TpuTransferAsyncOpKernelBase(ctx, transfer_type, number_of_threads, std::move(transfer_op)) {} @@ -140,7 +140,7 @@ TpuTransferAsyncDynamicOrdinalOpKernel::TpuTransferAsyncDynamicOrdinalOpKernel( absl::Status TpuTransferAsyncDynamicOrdinalOpKernel::RunTransfer( OpKernelContext* ctx) { const Tensor& device_ordinal_tensor = ctx->input(0); - const int device_ordinal = device_ordinal_tensor.scalar()(); + const int device_ordinal = device_ordinal_tensor.scalar()(); XlaDevice* xla_device = dynamic_cast(ctx->device()->UnderlyingDevice()); if (((xla_device == nullptr) || (xla_device->device_type() == DEVICE_CPU)) && diff --git a/tensorflow/core/tpu/ops/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/ops/sparse_core_preprocess_ops.cc index 4985cea9558993..ed7ff78c77da57 100644 --- a/tensorflow/core/tpu/ops/sparse_core_preprocess_ops.cc +++ b/tensorflow/core/tpu/ops/sparse_core_preprocess_ops.cc @@ -73,14 +73,14 @@ REGISTER_OP("GetMinibatchesInCsrWithPhysicalReplica") for (int i = 0; i < c->num_inputs(); ++i) { TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &rank)); } - int32 max_minibatches_per_sc; + int32_t max_minibatches_per_sc; TF_RETURN_IF_ERROR( c->GetAttr("max_minibatches_per_sc", &max_minibatches_per_sc)); - int32 num_replica; + int32_t num_replica; TF_RETURN_IF_ERROR(c->GetAttr("num_replica", &num_replica)); - int32 sample_count; + int32_t sample_count; TF_RETURN_IF_ERROR(c->GetAttr("sample_count", &sample_count)); - int32 max_ids_per_chip_per_sample; + int32_t max_ids_per_chip_per_sample; TF_RETURN_IF_ERROR(c->GetAttr("max_ids_per_chip_per_sample", &max_ids_per_chip_per_sample)); @@ -88,7 +88,7 @@ REGISTER_OP("GetMinibatchesInCsrWithPhysicalReplica") // will be run as part of the graph generation which might not have the // tpu system available. 
const int xla_pad_size = 8; - int32 num_sc_per_chip; + int32_t num_sc_per_chip; TF_RETURN_IF_ERROR(c->GetAttr("num_sc_per_chip", &num_sc_per_chip)); const int num_physical_replica = num_replica * num_sc_per_chip; @@ -253,22 +253,22 @@ REGISTER_OP("ConvertToSparseCoreCsrWrappedCooTensor") .Attr("table_name: string") .Attr("allow_id_dropping: bool") .SetShapeFn([](shape_inference::InferenceContext* c) { - int32 max_minibatches_per_sc; + int32_t max_minibatches_per_sc; TF_RETURN_IF_ERROR( c->GetAttr("max_minibatches_per_sc", &max_minibatches_per_sc)); - int32 num_replica; + int32_t num_replica; TF_RETURN_IF_ERROR(c->GetAttr("num_replica", &num_replica)); - int32 sample_count_per_sc; + int32_t sample_count_per_sc; TF_RETURN_IF_ERROR( c->GetAttr("sample_count_per_sc", &sample_count_per_sc)); - int32 max_ids_per_chip_per_sample; + int32_t max_ids_per_chip_per_sample; TF_RETURN_IF_ERROR(c->GetAttr("max_ids_per_chip_per_sample", &max_ids_per_chip_per_sample)); // We can't get this number programmatically since the shape inference // will be run as part of the graph generation which might not have the // tpu system available. const int xla_pad_size = 8; - int32 num_sc_per_chip; + int32_t num_sc_per_chip; TF_RETURN_IF_ERROR(c->GetAttr("num_sc_per_chip", &num_sc_per_chip)); const int num_physical_replica = num_replica * num_sc_per_chip; diff --git a/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc index 587d6341527a20..95044439f5b894 100644 --- a/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc +++ b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc @@ -256,7 +256,7 @@ absl::Status UseGradientAccumulation(const OptimizationParameters& params, } case GradientAccumulationSupport::kNotSupported: { if (raw_gradient_accumulation_status) { - return errors::InvalidArgument(strings::Printf( + return errors::InvalidArgument(absl::StrFormat( "Optimization algorithm %s does not support gradient accumulation " "but parameters specify it.", GetOptimizationAlgorithmName(params.parameters_case()).c_str())); diff --git a/tensorflow/core/tpu/tpu_execute.cc b/tensorflow/core/tpu/tpu_execute.cc index 865683dcb430cf..251cde239bcf6c 100644 --- a/tensorflow/core/tpu/tpu_execute.cc +++ b/tensorflow/core/tpu/tpu_execute.cc @@ -115,16 +115,16 @@ absl::Status FixTupleTableAsync(se::Stream* stream, if (!element_shape.IsTuple()) { return absl::OkStatus(); } - std::vector elements; + std::vector elements; xla::ShapeIndex element_index = index; element_index.push_back(0); for (int i = 0; i < element_shape.tuple_shapes().size(); ++i) { // Gather all children of the tuple element. element_index.back() = i; - elements.push_back(mem->Buffer(element_index).AsDeviceMemoryBase()); + elements.push_back(mem->Buffer(element_index).AsDeviceAddress()); } - se::DeviceMemoryBase tuple_table_addr = - mem->Buffer(index).AsDeviceMemoryBase(); + stream_executor::DeviceAddressBase tuple_table_addr = + mem->Buffer(index).AsDeviceAddress(); return transfer_manager->WriteSingleTupleIndexTable( stream, elements, element_shape, &tuple_table_addr); }); @@ -160,7 +160,7 @@ bool DynamicShapeIsCompatible(const xla::Shape& dynamic_shape, // Metadata contains the sizes of shape without padding, eventually // representing the size of valid data. 
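The tpu_embedding_optimization_parameters_utils.cc hunk above swaps strings::Printf for absl::StrFormat. A standalone sketch of the replacement call, with an illustrative algorithm name:

#include <iostream>
#include <string>
#include "absl/strings/str_format.h"

int main() {
  const std::string algorithm = "Adagrad";  // illustrative only
  // Unlike Printf, absl::StrFormat checks the format string at compile time
  // and accepts std::string arguments for %s directly.
  const std::string msg = absl::StrFormat(
      "Optimization algorithm %s does not support gradient accumulation "
      "but parameters specify it.",
      algorithm);
  std::cout << msg << "\n";
}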
absl::Status UpdateDynamicInputs( - se::Stream* stream, se::DeviceMemoryAllocator* allocator, + se::Stream* stream, stream_executor::DeviceAddressAllocator* allocator, std::vector* runtime_inputs, const std::vector& compile_time_shapes) { TF_RET_CHECK(runtime_inputs->size() == compile_time_shapes.size()); @@ -193,14 +193,15 @@ absl::Status UpdateDynamicInputs( TF_RET_CHECK( DynamicShapeIsCompatible(runtime_shape, compile_time_shape)); - xla::MaybeOwningDeviceMemory* mutable_input_mem = + xla::MaybeOwningDeviceAddress* mutable_input_mem = runtime_input.MutableBuffer(index); auto padded_data = std::make_shared>( ShapeSizeCompact(compile_time_shape), -1); auto raw_input_runtime = std::make_shared>( ShapeSizeCompact(runtime_shape) / sizeof(uint32_t)); TF_RETURN_IF_ERROR(stream->MemcpyD2H( - se::DeviceMemory(mutable_input_mem->AsDeviceMemoryBase()), + stream_executor::DeviceAddress( + mutable_input_mem->AsDeviceAddress()), absl::MakeSpan(absl::bit_cast(raw_input_runtime->data()), ShapeSizeCompactRaw(runtime_shape)))); TF_RETURN_IF_ERROR(stream->DoHostCallbackWithStatus( @@ -239,7 +240,7 @@ absl::Status UpdateDynamicInputs( allocator->Allocate(stream->parent()->device_ordinal(), ShapeSizeCompact(compile_time_shape))); auto typed_new_input_memory = - se::DeviceMemory(new_input.cref()); + stream_executor::DeviceAddress(new_input.cref()); TF_RETURN_IF_ERROR( stream->MemcpyH2D(*padded_data, &typed_new_input_memory)); @@ -249,7 +250,7 @@ absl::Status UpdateDynamicInputs( // Modify the memory location in the input shape tree to point to the // new input. *mutable_input_mem = - xla::MaybeOwningDeviceMemory(std::move(new_input)); + xla::MaybeOwningDeviceAddress(std::move(new_input)); element_modified = true; return absl::OkStatus(); })); @@ -474,7 +475,7 @@ absl::StatusOr TPUExecute( VLOG(1) << "TPUExecute: Updating TPUEmbedding memory addresses on " << device_ordinal; - SE_DeviceMemoryBase* device_memory_addrs = nullptr; + SE_DeviceAddressBase* device_memory_addrs = nullptr; size_t device_memory_addrs_count; auto device_memory_cleanup = absl::MakeCleanup([device_memory_addrs, device_ordinal]() { @@ -499,9 +500,9 @@ absl::StatusOr TPUExecute( VLOG(1) << "TPUExecute: Adding " << device_memory_addrs_count << " TPUEmbedding memory addresses to HLO parameters."; for (int i = 0; i < device_memory_addrs_count; ++i) { - xla::ShapeTree tree( + xla::ShapeTree tree( xla::ShapeUtil::MakeOpaqueShape()); - const SE_DeviceMemoryBase& addr = device_memory_addrs[i]; + const SE_DeviceAddressBase& addr = device_memory_addrs[i]; VLOG(2) << absl::StrFormat("Device memory addr[%i] = {%p, %llu, %llu}", i, addr.opaque, addr.size, addr.payload); *tree.mutable_element({}) = ApiConverter::FromC(addr); diff --git a/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.cc b/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.cc index c7a943533ba8ec..e58e58aec7f9c0 100644 --- a/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.cc +++ b/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.cc @@ -133,8 +133,8 @@ struct EliminateForPassthroughIterArgs static ForRegionOp RebuildOp(const llvm::BitVector &indices, ForRegionOp op, IRRewriter &rewriter) { rewriter.setInsertionPoint(op); - auto new_op = rewriter.create( - op.getLoc(), FilterByIndex(op.getOuts().getTypes(), indices), + auto new_op = ForRegionOp::create( + rewriter, op.getLoc(), FilterByIndex(op.getOuts().getTypes(), indices), op.getCtl().getType(), op.getStart(), op.getLimit(), op.getDelta(), FilterByIndex(op.getInit(), 
indices), op.getCtls(), op.getBodyAttrsAttr(), op.getRegionAttrsAttr()); @@ -163,8 +163,8 @@ struct EliminateWhileLikePassthroughIterArgs WhileLikeRegionOp op, IRRewriter &rewriter) { rewriter.setInsertionPoint(op); - auto new_op = rewriter.create( - op.getLoc(), FilterByIndex(op.getOuts().getTypes(), indices), + auto new_op = WhileLikeRegionOp::create( + rewriter, op.getLoc(), FilterByIndex(op.getOuts().getTypes(), indices), op.getCtl().getType(), FilterByIndex(op.getInit(), indices), op.getCtls(), op.getParallelIterationsAttr(), op.getCondAttrsAttr(), op.getBodyAttrsAttr(), op.getCondRegionAttrsAttr(), diff --git a/tensorflow/core/transforms/func_to_graph/func_to_graph.cc b/tensorflow/core/transforms/func_to_graph/func_to_graph.cc index 1c2941bd8da120..5be91dbb286d92 100644 --- a/tensorflow/core/transforms/func_to_graph/func_to_graph.cc +++ b/tensorflow/core/transforms/func_to_graph/func_to_graph.cc @@ -102,7 +102,7 @@ absl::Status FuncToGraph(GraphFuncOp func) { } OpBuilder builder(func); - auto graph = builder.create(func.getLoc(), version); + auto graph = GraphOp::create(builder, func.getLoc(), version); // Remove the terminator. func.SingleBlock::getBody()->getTerminator()->erase(); diff --git a/tensorflow/core/transforms/functional_to_region/impl.cc b/tensorflow/core/transforms/functional_to_region/impl.cc index f5bdd163ed1007..aaf67332ae2d48 100644 --- a/tensorflow/core/transforms/functional_to_region/impl.cc +++ b/tensorflow/core/transforms/functional_to_region/impl.cc @@ -322,8 +322,8 @@ LogicalResult ConvertIfLikeOp::matchAndRewrite( // Create the region-based op, passing in the required attributes. ValueRange args, ctls; std::tie(args, ctls) = this->SplitControl(op.getArgs()); - auto region_op = rewriter.create( - op.getLoc(), op.getResultTypes(), op.getCond(), ctls, + auto region_op = IfLikeRegionOp::create( + rewriter, op.getLoc(), op.getResultTypes(), op.getCond(), ctls, op.getThenBranch().getAttrs(), op.getElseBranch().getAttrs(), PreserveAttributes(then_func, /*drop_args=*/true), PreserveAttributes(else_func, /*drop_args=*/true)); @@ -390,8 +390,8 @@ LogicalResult ConvertCaseLikeOp::matchAndRewrite( // Create the region-based op, passing in the required attributes. ValueRange args, ctls; std::tie(args, ctls) = this->SplitControl(op.getArgs()); - auto region_op = rewriter.create( - op.getLoc(), op.getResultTypes(), op.getBranchIndex(), ctls, + auto region_op = CaseLikeRegionOp::create( + rewriter, op.getLoc(), op.getResultTypes(), op.getBranchIndex(), ctls, rewriter.getArrayAttr(branch_attrs), region_attrs, op.getBranches().size()); util::ForwardNonIntrinsicAttributes(op, region_op); @@ -440,8 +440,8 @@ ConvertWhileLikeOp::matchAndRewrite( // TODO(jeffniu): Change this to call the infer return types builder. ValueRange init, ctls; std::tie(init, ctls) = this->SplitControl(op.getArgs()); - auto region_op = rewriter.create( - op.getLoc(), op.getResultTypes(), init, ctls, + auto region_op = WhileLikeRegionOp::create( + rewriter, op.getLoc(), op.getResultTypes(), init, ctls, op.getParallelIterationsAttr(), op.getCond().getAttrs(), op.getBody().getAttrs(), PreserveAttributes(cond_func), PreserveAttributes(body_func)); @@ -482,8 +482,8 @@ LogicalResult ConvertForOp::matchAndRewrite(tfg::ForOp op, // `ForRegion` does. We will need to insert casts. 
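The transform hunks in this file and in the graph_to_func and region_to_functional files below all apply one mechanical change: rewriter.create<OpT>(loc, ...) becomes OpT::create(rewriter, loc, ...), with the builder passed as the first argument. A standalone sketch of the pattern using hypothetical stand-in types rather than MLIR itself:

#include <iostream>
#include <string>
#include <utility>

struct Rewriter;  // stand-in for mlir::IRRewriter

struct MyOp {
  std::string label;
  // New-style entry point: the builder comes first, then the usual arguments.
  static MyOp create(Rewriter& /*rewriter*/, const std::string& loc) {
    return MyOp{"MyOp@" + loc};
  }
};

struct Rewriter {
  // Old-style entry point: a member template that forwards to the op's create.
  template <typename OpT, typename... Args>
  OpT create(Args&&... args) {
    return OpT::create(*this, std::forward<Args>(args)...);
  }
};

int main() {
  Rewriter rewriter;
  MyOp before = rewriter.create<MyOp>("loc");  // old spelling
  MyOp after = MyOp::create(rewriter, "loc");  // spelling used by this patch
  std::cout << before.label << " " << after.label << "\n";
}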
ValueRange init, ctls; std::tie(init, ctls) = SplitControl(op.getArgs()); - auto region_op = rewriter.create( - op.getLoc(), op.getResultTypes(), op.getStart(), op.getLimit(), + auto region_op = ForRegionOp::create( + rewriter, op.getLoc(), op.getResultTypes(), op.getStart(), op.getLimit(), op.getDelta(), init, ctls, op.getBody().getAttrs(), PreserveAttributes(body_func)); util::ForwardNonIntrinsicAttributes(op, region_op); diff --git a/tensorflow/core/transforms/graph_to_func/graph_to_func.cc b/tensorflow/core/transforms/graph_to_func/graph_to_func.cc index d3769db8bcdf00..ae9e8d48c6a17a 100644 --- a/tensorflow/core/transforms/graph_to_func/graph_to_func.cc +++ b/tensorflow/core/transforms/graph_to_func/graph_to_func.cc @@ -66,8 +66,8 @@ absl::Status GraphToFunc(GraphOp graph, ArrayRef feeds, FunctionType func_type = builder.getFunctionType(arg_types, ret_types); auto loc = graph.getLoc(); - auto func_op = builder.create(loc, func_name, func_type, - /*generic=*/false); + auto func_op = GraphFuncOp::create(builder, loc, func_name, func_type, + /*generic=*/false); func_op->setAttr("tfg.lifted_graph_version", graph.getVersion()); func_op.getRegion().takeBody(graph.getRegion()); @@ -75,7 +75,7 @@ absl::Status GraphToFunc(GraphOp graph, ArrayRef feeds, // fetches, the fetch value will be replaced with feed argument. OpBuilder body_builder = OpBuilder::atBlockEnd(func_op.SingleBlock::getBody()); - body_builder.create(loc, fetches, control_rets); + ReturnOp::create(body_builder, loc, fetches, control_rets); StringAttr tfg_name = dialect->getTfgNameAttrIdentifier(); StringAttr lifted_value_name = builder.getStringAttr("tfg.lifted_value_attr"); diff --git a/tensorflow/core/transforms/region_to_functional/impl.cc b/tensorflow/core/transforms/region_to_functional/impl.cc index 65c37b8b468825..9fce62a74a1173 100644 --- a/tensorflow/core/transforms/region_to_functional/impl.cc +++ b/tensorflow/core/transforms/region_to_functional/impl.cc @@ -16,7 +16,6 @@ limitations under the License. 
#include "tensorflow/core/transforms/region_to_functional/impl.h" #include -#include #include #include #include @@ -233,8 +232,8 @@ struct ConvertIfLikeRegionOpToExplicitCapture IfLikeRegionOp RebuildWith(IfLikeRegionOp op, ValueRange added, PatternRewriter &rewriter) const override { - return rewriter.create( - op.getLoc(), op.getResultTypes(), op.getCond(), op.getCtls(), + return IfLikeRegionOp::create( + rewriter, op.getLoc(), op.getResultTypes(), op.getCond(), op.getCtls(), op.getThenAttrsAttr(), op.getElseAttrsAttr(), op.getThenRegionAttrsAttr(), op.getElseRegionAttrsAttr()); } @@ -247,9 +246,9 @@ struct ConvertCaseLikeRegionOpToExplicitCapture CaseLikeRegionOp RebuildWith(CaseLikeRegionOp op, ValueRange added, PatternRewriter &rewriter) const override { - return rewriter.create( - op.getLoc(), op.getResultTypes(), op.getBranchIndex(), op.getCtls(), - op.getBranchAttrsAttr(), op.getRegionAttrsAttr(), + return CaseLikeRegionOp::create( + rewriter, op.getLoc(), op.getResultTypes(), op.getBranchIndex(), + op.getCtls(), op.getBranchAttrsAttr(), op.getRegionAttrsAttr(), op.getBranches().size()); } }; @@ -295,9 +294,9 @@ struct ConvertWhileLikeRegionOpToExplicitCapture util::LoopRegionResultAdded(op.getBodyRegion(), added.size()); rewriter.setInsertionPoint(op); - return rewriter.create( - op.getLoc(), results, op.getCtl().getType(), operands, op.getCtls(), - op.getParallelIterationsAttr(), op.getCondAttrsAttr(), + return WhileLikeRegionOp::create( + rewriter, op.getLoc(), results, op.getCtl().getType(), operands, + op.getCtls(), op.getParallelIterationsAttr(), op.getCondAttrsAttr(), op.getBodyAttrsAttr(), op.getCondRegionAttrsAttr(), op.getBodyRegionAttrsAttr()); } @@ -324,8 +323,8 @@ struct ConvertForRegionOpToExplicitCapture util::LoopRegionResultAdded(op.getBodyRegion(), added.size()); rewriter.setInsertionPoint(op); - return rewriter.create( - op.getLoc(), results, op.getCtl().getType(), op.getStart(), + return ForRegionOp::create( + rewriter, op.getLoc(), results, op.getCtl().getType(), op.getStart(), op.getLimit(), op.getDelta(), operands, op.getCtls(), op.getBodyAttrsAttr(), op.getRegionAttrsAttr()); } @@ -870,8 +869,8 @@ LogicalResult ConvertIfLikeOp::matchAndRewrite( rewriter.setInsertionPoint(op); auto func_op = - rewriter.create(op.getLoc(), op.getResultTypes(), op.getCond(), - operands, branches[0], branches[1]); + IfLikeOp::create(rewriter, op.getLoc(), op.getResultTypes(), op.getCond(), + operands, branches[0], branches[1]); util::ForwardNonIntrinsicAttributes(op, func_op); rewriter.replaceOp(op, func_op.getResults()); return success(); @@ -924,9 +923,9 @@ LogicalResult ConvertCaseLikeOp::matchAndRewrite( llvm::append_range(operands, op.getCtls()); rewriter.setInsertionPoint(op); - auto func_op = rewriter.create(op.getLoc(), op.getResultTypes(), - op.getBranchIndex(), operands, - rewriter.getArrayAttr(branches)); + auto func_op = CaseLikeOp::create(rewriter, op.getLoc(), op.getResultTypes(), + op.getBranchIndex(), operands, + rewriter.getArrayAttr(branches)); util::ForwardNonIntrinsicAttributes(op, func_op); rewriter.replaceOp(op, func_op.getResults()); return success(); @@ -1000,9 +999,9 @@ ConvertWhileLikeOp::matchAndRewrite( llvm::append_range(operands, op.getCtls()); rewriter.setInsertionPoint(op); - auto func_op = rewriter.create(op.getLoc(), op.getResultTypes(), - operands, cond_ref, body_ref, - op.getParallelIterationsAttr()); + auto func_op = + WhileLikeOp::create(rewriter, op.getLoc(), op.getResultTypes(), operands, + cond_ref, body_ref, 
op.getParallelIterationsAttr()); util::ForwardNonIntrinsicAttributes(op, func_op); rewriter.replaceOp(op, func_op.getResults()); return success(); @@ -1038,9 +1037,9 @@ LogicalResult ConvertForOp::matchAndRewrite(ForRegionOp op, llvm::append_range(operands, op.getCtls()); rewriter.setInsertionPoint(op); - auto func_op = rewriter.create( - op.getLoc(), op.getResultTypes(), op.getStart(), op.getLimit(), - op.getDelta(), operands, body_ref[0]); + auto func_op = tfg::ForOp::create(rewriter, op.getLoc(), op.getResultTypes(), + op.getStart(), op.getLimit(), op.getDelta(), + operands, body_ref[0]); util::ForwardNonIntrinsicAttributes(op, func_op); rewriter.replaceOp(op, func_op.getResults()); return success(); diff --git a/tensorflow/core/util/BUILD b/tensorflow/core/util/BUILD index 3acd07c02fadf8..05ae29c1619d87 100644 --- a/tensorflow/core/util/BUILD +++ b/tensorflow/core/util/BUILD @@ -298,6 +298,7 @@ filegroup( "mkl_heuristics.h", "mkl_util.h", "onednn_env_vars.h", + "@com_google_absl//absl/container:flat_hash_map", "@local_xla//xla/tsl/util:onednn_util_hdrs", ], visibility = ["//tensorflow/core:__pkg__"], @@ -952,38 +953,39 @@ tf_proto_library( name = "test_log_proto", srcs = ["test_log.proto"], make_default_target_header_only = True, - protodeps = ["@local_xla//xla/tsl/protobuf:test_log_proto"], # Not to be used outside of tensorflow/core. visibility = ["//tensorflow/core:__pkg__"], exports = ["@local_xla//xla/tsl/protobuf:test_log_proto"], + deps = ["@local_xla//xla/tsl/protobuf:test_log_proto"], ) tf_proto_library( name = "protos_test", srcs = ["example_proto_fast_parsing_test.proto"], - protodeps = ["//tensorflow/core:protos_all"], visibility = ["//visibility:public"], + deps = ["//tensorflow/core:protos_all"], ) tf_proto_library( name = "event_proto", srcs = ["event.proto"], make_default_target_header_only = True, - protodeps = [ - "//tensorflow/core/framework:summary_proto", + visibility = ["//visibility:public"], + deps = [ "//tensorflow/core/framework:resource_handle_proto", + "//tensorflow/core/framework:summary_proto", "//tensorflow/core/framework:tensor_proto", "//tensorflow/core/framework:tensor_shape_proto", "//tensorflow/core/framework:types_proto", ], - visibility = ["//visibility:public"], ) tf_proto_library( name = "saved_tensor_slice_proto", srcs = ["saved_tensor_slice.proto"], make_default_target_header_only = True, - protodeps = [ + visibility = ["//visibility:public"], + deps = [ "//tensorflow/core/framework:resource_handle_proto", "//tensorflow/core/framework:tensor_proto", "//tensorflow/core/framework:tensor_shape_proto", @@ -991,7 +993,6 @@ tf_proto_library( "//tensorflow/core/framework:types_proto", "//tensorflow/core/framework:versions_proto", ], - visibility = ["//visibility:public"], ) tf_proto_library( @@ -1004,16 +1005,16 @@ tf_proto_library( tf_proto_library( name = "protos_all", make_default_target_header_only = True, - protodeps = [ - ":event_proto", - ":saved_tensor_slice_proto", - ":memmapped_file_system_proto", - "//tensorflow/core/util/quantization:uniform_quant_ops_attr_proto", - ], tags = [ "alt_dep=//third_party/tensorflow/core:protos_all", ], visibility = ["//tensorflow/core:__subpackages__"], + deps = [ + ":event_proto", + ":memmapped_file_system_proto", + ":saved_tensor_slice_proto", + "//tensorflow/core/util/quantization:uniform_quant_ops_attr_proto", + ], ) cc_library( diff --git a/tensorflow/core/util/autotune_maps/BUILD b/tensorflow/core/util/autotune_maps/BUILD index 3868d4971b8035..a5de2c3ba00baa 100644 --- 
a/tensorflow/core/util/autotune_maps/BUILD +++ b/tensorflow/core/util/autotune_maps/BUILD @@ -193,7 +193,6 @@ tf_cuda_only_cc_test( size = "small", srcs = ["autotune_serialize_test.cc"], features = ["-layering_check"], - tags = ["cuda-only"], deps = [ ":autotune_serialize", ":conv_autotune_maps", diff --git a/tensorflow/core/util/autotune_maps/conv_autotune_maps.h b/tensorflow/core/util/autotune_maps/conv_autotune_maps.h index 7c00348adfe1ba..ebf542b2afbd75 100644 --- a/tensorflow/core/util/autotune_maps/conv_autotune_maps.h +++ b/tensorflow/core/util/autotune_maps/conv_autotune_maps.h @@ -39,7 +39,7 @@ namespace tensorflow { // A dummy type to group forward convolution autotune results together. struct ConvAutotuneGroup { - static string name() { return "Conv"; } + static std::string name() { return "Conv"; } }; using ConvAutotuneMap = AutotuneSingletonproto_, other.proto_); } -string ConvParameters::ToString() const { return proto_.DebugString(); } +std::string ConvParameters::ToString() const { return proto_.DebugString(); } MatmulParameters::MatmulParameters( se::StreamExecutor* stream_exec, DataType ab_dtype, DataType c_dtype, @@ -137,7 +137,7 @@ bool MatmulParameters::operator==(const MatmulParameters& other) const { MessageDifferencer::Equals(this->proto_, other.proto_); } -string MatmulParameters::ToString() const { return proto_.DebugString(); } +std::string MatmulParameters::ToString() const { return proto_.DebugString(); } } // namespace tensorflow diff --git a/tensorflow/core/util/autotune_maps/conv_parameters.h b/tensorflow/core/util/autotune_maps/conv_parameters.h index b213dba9298dd3..12da493f4a59f4 100644 --- a/tensorflow/core/util/autotune_maps/conv_parameters.h +++ b/tensorflow/core/util/autotune_maps/conv_parameters.h @@ -90,16 +90,16 @@ class ConvParameters { bool operator!=(const ConvParameters& other) const { return !(*this == other); } - uint64 hash() const { return hash_code_; } + uint64_t hash() const { return hash_code_; } - string ToString() const; + std::string ToString() const; const ConvParametersProto& proto() const { return proto_; } private: int device_id_; ConvParametersProto proto_; - uint64 hash_code_; + uint64_t hash_code_; }; class MatmulParameters { @@ -127,16 +127,16 @@ class MatmulParameters { bool operator!=(const MatmulParameters& other) const { return !(*this == other); } - uint64 hash() const { return hash_code_; } + uint64_t hash() const { return hash_code_; } - string ToString() const; + std::string ToString() const; const MatmulParametersProto& proto() const { return proto_; } private: int device_id_; MatmulParametersProto proto_; - uint64 hash_code_; + uint64_t hash_code_; }; } // namespace tensorflow diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index d15ec3034a93c9..fd1762482b4340 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -26,6 +26,7 @@ limitations under the License. #include "oneapi/dnnl/dnnl.hpp" #include "oneapi/dnnl/dnnl_threadpool.hpp" +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -1963,7 +1964,7 @@ class LRUCache { size_t capacity_; // The cache, a map from string key to a LRU entry. - std::unordered_map cache_; + absl::flat_hash_map cache_; // The LRU list of entries. 
// The front of the list contains the key of the most recently accessed diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h index 7d43e34b35ce50..a3a5c5a72c2f01 100644 --- a/tensorflow/core/util/proto/decode.h +++ b/tensorflow/core/util/proto/decode.h @@ -42,7 +42,7 @@ using tensorflow::protobuf::io::StringOutputStream; // Converts an uint64 to an int64 without loss of information. // Unsigned values greater than INT64_MAX are represented as // negative numbers by wrapping (same as twos-complement bit equivalence). -inline int64_t WrapUnsignedAsSigned64(uint64 unsigned_value) { +inline int64_t WrapUnsignedAsSigned64(uint64_t unsigned_value) { // For a detailed explanation of why this works to wrap unsigned ints, see // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior // Both if tests should be optimized out. @@ -59,16 +59,16 @@ inline int64_t WrapUnsignedAsSigned64(uint64 unsigned_value) { // Converts an uint32 to an int32 without loss of information. // Unsigned values greater than INT_MAX are represented as // negative numbers by wrapping (same as twos-complement bit equivalence). -inline int32 WrapUnsignedAsSigned32(uint32 unsigned_value) { +inline int32_t WrapUnsignedAsSigned32(uint32_t unsigned_value) { // For a detailed explanation of why this works to wrap unsigned ints, see // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior // Both if tests should be optimized out. if (unsigned_value <= INT_MAX) { - return static_cast(unsigned_value); + return static_cast(unsigned_value); } // The C++ spec allows an architecture where this test is required. if (unsigned_value >= INT_MIN) { - return static_cast(unsigned_value - INT_MIN) + INT_MIN; + return static_cast(unsigned_value - INT_MIN) + INT_MIN; } return 0; // This should never occur. } @@ -78,8 +78,8 @@ inline int32 WrapUnsignedAsSigned32(uint32 unsigned_value) { // space in the buffer. // The ok value will be set to false if the buffer does not contain // a valid varint. -inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok, - uint64* value); +inline const uint8_t* ReadVarint64FromArray(const uint8_t* buffer, bool* ok, + uint64_t* value); // Reads a single varint32 from a byte array. // It is the caller's responsibility to ensure that there is enough @@ -89,10 +89,10 @@ inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok, // This is slightly less efficient than the private version in // coded_stream.cc but we duplicate less code by calling // the 64 bit version instead of copying the code. -inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok, - uint32* value) { - uint64 tmp = 0; - const uint8* buf = ReadVarint64FromArray(buffer, ok, &tmp); +inline const uint8_t* ReadVarint32FromArray(const uint8_t* buffer, bool* ok, + uint32_t* value) { + uint64_t tmp = 0; + const uint8_t* buf = ReadVarint64FromArray(buffer, ok, &tmp); *value = tmp & 0xffffffff; return buf; } @@ -101,12 +101,12 @@ inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok, // The array is part of a Tensor that was allocated by the caller // with type TensorType, while DeclaredType is the proto field type. 
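WrapUnsignedAsSigned64 in the decode.h hunk above converts uint64_t to int64_t by two's-complement wrapping, so values above INT64_MAX come out negative with the bit pattern preserved. A standalone sketch mirroring that behaviour:

#include <cstdint>
#include <iostream>

int64_t WrapUnsignedAsSigned64(uint64_t v) {
  if (v <= static_cast<uint64_t>(INT64_MAX)) return static_cast<int64_t>(v);
  // v is in [2^63, 2^64): shift it down into the signed range, cast, shift back.
  return static_cast<int64_t>(v - static_cast<uint64_t>(INT64_MIN)) + INT64_MIN;
}

int main() {
  std::cout << WrapUnsignedAsSigned64(3) << "\n";           // 3
  std::cout << WrapUnsignedAsSigned64(UINT64_MAX) << "\n";  // -1 (all bits set)
}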
template -const uint8* ReadFromArray(const uint8* buf, TensorType* value); +const uint8_t* ReadFromArray(const uint8_t* buf, TensorType* value); template <> -inline const uint8* ReadFromArray( - const uint8* buf, int64_t* value) { - uint32 temp = 0; +inline const uint8_t* ReadFromArray( + const uint8_t* buf, int64_t* value) { + uint32_t temp = 0; bool unused_ok; // The Counting pass would have failed if this were corrupt. buf = ReadVarint32FromArray(buf, &unused_ok, &temp); *value = static_cast(temp); @@ -114,19 +114,19 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, int32* value) { - uint32 temp = 0; +inline const uint8_t* ReadFromArray( + const uint8_t* buf, int32_t* value) { + uint32_t temp = 0; bool unused_ok; // The Counting pass would have failed if this were corrupt. buf = ReadVarint32FromArray(buf, &unused_ok, &temp); - *value = static_cast(temp); + *value = static_cast(temp); return buf; } template <> -inline const uint8* ReadFromArray( - const uint8* buf, int64_t* value) { - uint64 temp = 0; +inline const uint8_t* ReadFromArray( + const uint8_t* buf, int64_t* value) { + uint64_t temp = 0; bool unused_ok; // The Counting pass would have failed if this were corrupt. buf = ReadVarint64FromArray(buf, &unused_ok, &temp); *value = WrapUnsignedAsSigned64(temp); @@ -134,9 +134,9 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, uint64* value) { - uint32 temp = 0; +inline const uint8_t* ReadFromArray( + const uint8_t* buf, uint64_t* value) { + uint32_t temp = 0; bool unused_ok; // The Counting pass would have failed if this were corrupt. buf = ReadVarint32FromArray(buf, &unused_ok, &temp); *value = temp; @@ -144,23 +144,23 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, uint32* value) { +inline const uint8_t* ReadFromArray( + const uint8_t* buf, uint32_t* value) { bool unused_ok; // The Counting pass would have failed if this were corrupt. return ReadVarint32FromArray(buf, &unused_ok, value); } template <> -inline const uint8* ReadFromArray( - const uint8* buf, uint64* value) { +inline const uint8_t* ReadFromArray( + const uint8_t* buf, uint64_t* value) { bool unused_ok; // The Counting pass would have failed if this were corrupt. return ReadVarint64FromArray(buf, &unused_ok, value); } template <> -inline const uint8* ReadFromArray( - const uint8* buf, int64_t* value) { - uint64 temp = 0; +inline const uint8_t* ReadFromArray( + const uint8_t* buf, int64_t* value) { + uint64_t temp = 0; bool unused_ok; // The Counting pass would have failed if this were corrupt. buf = ReadVarint64FromArray(buf, &unused_ok, &temp); *value = WireFormatLite::ZigZagDecode32(temp); @@ -168,9 +168,9 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, int32* value) { - uint32 temp = 0; +inline const uint8_t* ReadFromArray( + const uint8_t* buf, int32_t* value) { + uint32_t temp = 0; bool unused_ok; // The Counting pass would have failed if this were corrupt. 
buf = ReadVarint32FromArray(buf, &unused_ok, &temp); *value = WireFormatLite::ZigZagDecode32(temp); @@ -178,9 +178,9 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, int64_t* value) { - uint64 temp = 0; +inline const uint8_t* ReadFromArray( + const uint8_t* buf, int64_t* value) { + uint64_t temp = 0; bool unused_ok; // The Counting pass would have failed if this were corrupt. buf = ReadVarint64FromArray(buf, &unused_ok, &temp); *value = WireFormatLite::ZigZagDecode64(temp); @@ -188,10 +188,10 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, uint64* value) { - uint32 temp; - buf = WireFormatLite::ReadPrimitiveFromArray( + const uint8_t* buf, uint64_t* value) { + uint32_t temp; + buf = WireFormatLite::ReadPrimitiveFromArray( buf, &temp); *value = temp; @@ -199,10 +199,10 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, uint32* value) { - uint32 temp; - buf = WireFormatLite::ReadPrimitiveFromArray( + const uint8_t* buf, uint32_t* value) { + uint32_t temp; + buf = WireFormatLite::ReadPrimitiveFromArray( buf, &temp); *value = WrapUnsignedAsSigned32(temp); @@ -210,8 +210,8 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, uint64* value) { +inline const uint8_t* ReadFromArray( + const uint8_t* buf, uint64_t* value) { protobuf_uint64 temp; buf = WireFormatLite::ReadPrimitiveFromArray( @@ -221,10 +221,10 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, int64_t* value) { +inline const uint8_t* ReadFromArray( + const uint8_t* buf, int64_t* value) { int32_t temp; - buf = WireFormatLite::ReadPrimitiveFromArray( buf, &temp); *value = temp; @@ -232,16 +232,16 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, int32* value) { - return WireFormatLite::ReadPrimitiveFromArray( + const uint8_t* buf, int32_t* value) { + return WireFormatLite::ReadPrimitiveFromArray( buf, value); } template <> -inline const uint8* ReadFromArray( - const uint8* buf, int64_t* value) { +inline const uint8_t* ReadFromArray( + const uint8_t* buf, int64_t* value) { protobuf_int64 temp; buf = WireFormatLite::ReadPrimitiveFromArray( @@ -251,16 +251,16 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, float* value) { +inline const uint8_t* ReadFromArray( + const uint8_t* buf, float* value) { return WireFormatLite::ReadPrimitiveFromArray( buf, value); } template <> -inline const uint8* ReadFromArray( - const uint8* buf, double* value) { +inline const uint8_t* ReadFromArray( + const uint8_t* buf, double* value) { float temp; buf = WireFormatLite::ReadPrimitiveFromArray( @@ -270,17 +270,17 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, double* value) { +inline const uint8_t* ReadFromArray( + const uint8_t* buf, double* value) { return WireFormatLite::ReadPrimitiveFromArray( buf, value); } template <> -inline const uint8* ReadFromArray( - const uint8* buf, bool* value) { - uint64 temp = 0; +inline const uint8_t* ReadFromArray( + const uint8_t* buf, bool* value) { + uint64_t temp = 0; bool unused_ok; // The Counting pass would have failed if this were corrupt. 
buf = ReadVarint64FromArray(buf, &unused_ok, &temp); *value = temp != 0; @@ -288,9 +288,9 @@ inline const uint8* ReadFromArray( } template <> -inline const uint8* ReadFromArray( - const uint8* buf, int* value) { - uint32 temp = 0; +inline const uint8_t* ReadFromArray( + const uint8_t* buf, int* value) { + uint32_t temp = 0; bool unused_ok; // The Counting pass would have failed if this were corrupt. buf = ReadVarint32FromArray(buf, &unused_ok, &temp); *value = static_cast(temp); @@ -304,8 +304,8 @@ template inline int ReadPackedPrimitives(const void* bufp, const size_t len, const int index, const int stride, void* datap) { - const uint8* buf = reinterpret_cast(bufp); - const uint8* bound = buf + len; + const uint8_t* buf = reinterpret_cast(bufp); + const uint8_t* bound = buf + len; TensorType* data = reinterpret_cast(datap) + index; int count; @@ -340,7 +340,7 @@ inline absl::Status ReadPrimitive(CodedInputStream* input, int index, inline absl::Status ReadBytes(CodedInputStream* input, int index, void* datap) { tstring* data = reinterpret_cast(datap) + index; - uint32 length; + uint32_t length; if (!input->ReadVarint32(&length)) { return errors::DataLoss("Failed reading bytes"); } @@ -370,7 +370,7 @@ inline absl::Status ReadGroupBytes(CodedInputStream* input, int field_number, // TYPE_GROUP is deprecated and currently no tests in // tensorflow/python/kernel_tests/proto:decode_proto_op_test target a // TYPE_GROUP tag, we use std::string as a read buffer. - string buf; + std::string buf; StringOutputStream string_stream(&buf); { CodedOutputStream out(&string_stream); @@ -412,31 +412,33 @@ inline absl::Status ReadValue(CodedInputStream* input, return ReadPrimitive( input, index, datap); case WireFormatLite::TYPE_UINT64: - return ReadPrimitive(input, index, datap); case WireFormatLite::TYPE_INT32: switch (dtype) { case DataType::DT_INT64: - return ReadPrimitive( + return ReadPrimitive( input, index, datap); case DataType::DT_INT32: - return ReadPrimitive( + return ReadPrimitive( input, index, datap); default: return errors::DataLoss("Failed reading TYPE_INT32 for ", DataTypeString(dtype)); } case WireFormatLite::TYPE_FIXED64: - return ReadPrimitive(input, index, datap); case WireFormatLite::TYPE_FIXED32: switch (dtype) { case DataType::DT_UINT64: - return ReadPrimitive( - input, index, datap); + return ReadPrimitive(input, index, + datap); case DataType::DT_UINT32: - return ReadPrimitive( - input, index, datap); + return ReadPrimitive(input, index, + datap); default: return errors::DataLoss("Failed reading TYPE_FIXED32 for ", DataTypeString(dtype)); @@ -455,25 +457,25 @@ inline absl::Status ReadValue(CodedInputStream* input, case WireFormatLite::TYPE_UINT32: switch (dtype) { case DataType::DT_UINT64: - return ReadPrimitive( + return ReadPrimitive( input, index, datap); case DataType::DT_UINT32: - return ReadPrimitive( + return ReadPrimitive( input, index, datap); default: return errors::DataLoss("Failed reading TYPE_UINT32 for ", DataTypeString(dtype)); } case WireFormatLite::TYPE_ENUM: - return ReadPrimitive( + return ReadPrimitive( input, index, datap); case WireFormatLite::TYPE_SFIXED32: switch (dtype) { case DataType::DT_INT64: - return ReadPrimitive( + return ReadPrimitive( input, index, datap); case DataType::DT_INT32: - return ReadPrimitive( + return ReadPrimitive( input, index, datap); default: return errors::DataLoss("Failed reading TYPE_SFIXED32 for ", @@ -485,10 +487,10 @@ inline absl::Status ReadValue(CodedInputStream* input, case WireFormatLite::TYPE_SINT32: switch (dtype) { case 
DataType::DT_INT64: - return ReadPrimitive( + return ReadPrimitive( input, index, datap); case DataType::DT_INT32: - return ReadPrimitive( + return ReadPrimitive( input, index, datap); default: return errors::DataLoss("Failed reading TYPE_SINT32 for ", @@ -533,7 +535,7 @@ inline absl::Status ReadPackedFromArray( buf, buf_size, *index, stride, data); return absl::OkStatus(); case WireFormatLite::TYPE_UINT64: - *index += ReadPackedPrimitives( + *index += ReadPackedPrimitives( buf, buf_size, *index, stride, data); return absl::OkStatus(); case WireFormatLite::TYPE_INT32: @@ -543,7 +545,7 @@ inline absl::Status ReadPackedFromArray( buf, buf_size, *index, stride, data); return absl::OkStatus(); case DataType::DT_INT32: - *index += ReadPackedPrimitives( + *index += ReadPackedPrimitives( buf, buf_size, *index, stride, data); return absl::OkStatus(); default: @@ -551,18 +553,20 @@ inline absl::Status ReadPackedFromArray( DataTypeString(dtype)); } case WireFormatLite::TYPE_FIXED64: - *index += ReadPackedPrimitives( + *index += ReadPackedPrimitives( buf, buf_size, *index, stride, data); return absl::OkStatus(); case WireFormatLite::TYPE_FIXED32: switch (dtype) { case DataType::DT_UINT64: - *index += ReadPackedPrimitives( - buf, buf_size, *index, stride, data); + *index += + ReadPackedPrimitives( + buf, buf_size, *index, stride, data); return absl::OkStatus(); case DataType::DT_UINT32: - *index += ReadPackedPrimitives( - buf, buf_size, *index, stride, data); + *index += + ReadPackedPrimitives( + buf, buf_size, *index, stride, data); return absl::OkStatus(); default: return errors::DataLoss("Failed reading TYPE_FIXED32 for ", @@ -580,11 +584,11 @@ inline absl::Status ReadPackedFromArray( case WireFormatLite::TYPE_UINT32: switch (dtype) { case DataType::DT_UINT64: - *index += ReadPackedPrimitives( + *index += ReadPackedPrimitives( buf, buf_size, *index, stride, data); return absl::OkStatus(); case DataType::DT_UINT32: - *index += ReadPackedPrimitives( + *index += ReadPackedPrimitives( buf, buf_size, *index, stride, data); return absl::OkStatus(); default: @@ -592,7 +596,7 @@ inline absl::Status ReadPackedFromArray( DataTypeString(dtype)); } case WireFormatLite::TYPE_ENUM: - *index += ReadPackedPrimitives( + *index += ReadPackedPrimitives( buf, buf_size, *index, stride, data); return absl::OkStatus(); case WireFormatLite::TYPE_SFIXED32: @@ -603,8 +607,9 @@ inline absl::Status ReadPackedFromArray( buf, buf_size, *index, stride, data); return absl::OkStatus(); case DataType::DT_INT32: - *index += ReadPackedPrimitives( - buf, buf_size, *index, stride, data); + *index += + ReadPackedPrimitives( + buf, buf_size, *index, stride, data); return absl::OkStatus(); default: return errors::DataLoss("Failed reading TYPE_INT32 for ", @@ -622,7 +627,7 @@ inline absl::Status ReadPackedFromArray( buf, buf_size, *index, stride, data); return absl::OkStatus(); case DataType::DT_INT32: - *index += ReadPackedPrimitives( + *index += ReadPackedPrimitives( buf, buf_size, *index, stride, data); return absl::OkStatus(); default: @@ -645,14 +650,14 @@ inline absl::Status ReadPackedFromArray( // Important: This routine may read as much as kMaxVarintBytes from // the buffer. It is the caller's responsibility to make sure that there is // enough space in the buffer. 
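ReadVarint64FromArray in the next hunk accumulates the varint in three 32-bit pieces (28 + 28 + 8 payload bits) before assembling the 64-bit result. A compact standalone decoder of the same wire format, written as a plain loop rather than the split-accumulator version used there:

#include <cstdint>
#include <iostream>
#include <vector>

bool DecodeVarint64(const std::vector<uint8_t>& bytes, uint64_t* value) {
  uint64_t result = 0;
  int shift = 0;
  for (uint8_t b : bytes) {
    result |= static_cast<uint64_t>(b & 0x7f) << shift;
    if ((b & 0x80) == 0) {  // high bit clear: this was the last byte
      *value = result;
      return true;
    }
    shift += 7;
    if (shift >= 64) return false;  // malformed: more than 10 bytes
  }
  return false;  // ran out of input before the terminating byte
}

int main() {
  uint64_t v = 0;
  // 300 encodes as 0xAC 0x02 on the wire.
  if (DecodeVarint64({0xAC, 0x02}, &v)) std::cout << v << "\n";  // prints 300
}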
-inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok, - uint64* value) { - const uint8* ptr = buffer; - uint32 b; +inline const uint8_t* ReadVarint64FromArray(const uint8_t* buffer, bool* ok, + uint64_t* value) { + const uint8_t* ptr = buffer; + uint32_t b; // Splitting into 32-bit pieces gives better performance on 32-bit // processors. - uint32 part0 = 0, part1 = 0, part2 = 0; + uint32_t part0 = 0, part1 = 0, part2 = 0; b = *(ptr++); part0 = b; @@ -702,8 +707,9 @@ inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok, done: *ok = true; - *value = (static_cast(part0)) | (static_cast(part1) << 28) | - (static_cast(part2) << 56); + *value = (static_cast(part0)) | + (static_cast(part1) << 28) | + (static_cast(part2) << 56); return ptr; } diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.cc b/tensorflow/core/util/proto/descriptor_pool_registry.cc index 5f0423f76b74c2..e8184f6b2fabfc 100644 --- a/tensorflow/core/util/proto/descriptor_pool_registry.cc +++ b/tensorflow/core/util/proto/descriptor_pool_registry.cc @@ -27,19 +27,19 @@ DescriptorPoolRegistry* DescriptorPoolRegistry::Global() { } DescriptorPoolRegistry::DescriptorPoolFn* DescriptorPoolRegistry::Get( - const string& source) { + const std::string& source) { auto found = fns_.find(source); if (found == fns_.end()) return nullptr; return &found->second; } void DescriptorPoolRegistry::Register( - const string& source, + const std::string& source, const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) { auto existing = Get(source); CHECK_EQ(existing, nullptr) << "descriptor pool for source: " << source << " already registered"; - fns_.insert(std::pair(source, pool_fn)); + fns_.insert(std::pair(source, pool_fn)); } } // namespace tensorflow diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.h b/tensorflow/core/util/proto/descriptor_pool_registry.h index 59c709ea150e87..5718243c15cbab 100644 --- a/tensorflow/core/util/proto/descriptor_pool_registry.h +++ b/tensorflow/core/util/proto/descriptor_pool_registry.h @@ -39,13 +39,13 @@ class DescriptorPoolRegistry { static DescriptorPoolRegistry* Global(); // Returns a pointer to a descriptor pool function for the given source. - DescriptorPoolFn* Get(const string& source); + DescriptorPoolFn* Get(const std::string& source); // Registers a descriptor pool factory. - void Register(const string& source, const DescriptorPoolFn& pool_fn); + void Register(const std::string& source, const DescriptorPoolFn& pool_fn); private: - std::map fns_; + std::map fns_; }; namespace descriptor_pool_registration { @@ -53,7 +53,7 @@ namespace descriptor_pool_registration { class DescriptorPoolRegistration { public: DescriptorPoolRegistration( - const string& source, + const std::string& source, const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) { DescriptorPoolRegistry::Global()->Register(source, pool_fn); } diff --git a/tensorflow/core/util/proto/descriptors.cc b/tensorflow/core/util/proto/descriptors.cc index 31942145fe32fa..e485499c94d5f7 100644 --- a/tensorflow/core/util/proto/descriptors.cc +++ b/tensorflow/core/util/proto/descriptors.cc @@ -45,7 +45,7 @@ absl::Status CreatePoolFromSet( // The file must contain a serialized `FileDescriptorSet`. See // `GetDescriptorPool()` for more information. 
absl::Status GetDescriptorPoolFromFile( - tensorflow::Env* env, const string& filename, + tensorflow::Env* env, const std::string& filename, std::unique_ptr* owned_desc_pool) { absl::Status st = env->FileExists(filename); if (!st.ok()) { @@ -66,7 +66,7 @@ absl::Status GetDescriptorPoolFromFile( } absl::Status GetDescriptorPoolFromBinary( - const string& source, + const std::string& source, std::unique_ptr* owned_desc_pool) { if (!absl::StartsWith(source, "bytes://")) { return errors::InvalidArgument(absl::StrCat( @@ -76,7 +76,7 @@ absl::Status GetDescriptorPoolFromBinary( } // Parse the FileDescriptorSet. protobuf::FileDescriptorSet proto; - if (!proto.ParseFromString(string(absl::StripPrefix(source, "bytes://")))) { + if (!proto.ParseFromString(absl::StripPrefix(source, "bytes://"))) { return errors::InvalidArgument(absl::StrCat( "Source does not represent serialized file descriptor set proto. ", "This may be due to a missing dependency on the file containing ", @@ -88,7 +88,7 @@ absl::Status GetDescriptorPoolFromBinary( } // namespace absl::Status GetDescriptorPool( - Env* env, string const& descriptor_source, + Env* env, const std::string& descriptor_source, protobuf::DescriptorPool const** desc_pool, std::unique_ptr* owned_desc_pool) { // Attempt to lookup the pool in the registry. diff --git a/tensorflow/core/util/proto/descriptors.h b/tensorflow/core/util/proto/descriptors.h index 3402ed0504410e..7b6ce3b97b5053 100644 --- a/tensorflow/core/util/proto/descriptors.h +++ b/tensorflow/core/util/proto/descriptors.h @@ -46,7 +46,7 @@ using tsl::Env; // Custom schemas can be supported by registering a handler with the // `DescriptorPoolRegistry`. absl::Status GetDescriptorPool( - Env* env, string const& descriptor_source, + Env* env, const std::string& descriptor_source, protobuf::DescriptorPool const** desc_pool, std::unique_ptr* owned_desc_pool); diff --git a/tensorflow/core/util/proto/proto_utils.cc b/tensorflow/core/util/proto/proto_utils.cc index 0833352bf431d7..f0a103eaa2823c 100644 --- a/tensorflow/core/util/proto/proto_utils.cc +++ b/tensorflow/core/util/proto/proto_utils.cc @@ -79,20 +79,20 @@ absl::Status ParseTextFormatFromString(absl::string_view input, return absl::Status(absl::StatusCode::kInvalidArgument, "output must be non NULL"); } - string err; + std::string err; StringErrorCollector err_collector(&err, /*one-indexing=*/true); protobuf::TextFormat::Parser parser; parser.RecordErrorsTo(&err_collector); - if (!parser.ParseFromString(string(input), output)) { + if (!parser.ParseFromString(input, output)) { return absl::Status(absl::StatusCode::kInvalidArgument, err); } return absl::OkStatus(); } -StringErrorCollector::StringErrorCollector(string* error_text) +StringErrorCollector::StringErrorCollector(std::string* error_text) : StringErrorCollector(error_text, false) {} -StringErrorCollector::StringErrorCollector(string* error_text, +StringErrorCollector::StringErrorCollector(std::string* error_text, bool one_indexing) : error_text_(error_text), index_offset_(one_indexing ? 1 : 0) { DCHECK(error_text_ != nullptr) << "error_text must be non NULL"; diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h index 8a94a832fec58c..65c73e35c15f8b 100644 --- a/tensorflow/core/util/proto/proto_utils.h +++ b/tensorflow/core/util/proto/proto_utils.h @@ -44,11 +44,11 @@ class StringErrorCollector : public protobuf::io::ErrorCollector { public: // String error_text is unowned and must remain valid during the use of // StringErrorCollector. 
- explicit StringErrorCollector(string* error_text); + explicit StringErrorCollector(std::string* error_text); // If one_indexing is set to true, all line and column numbers will be // increased by one for cases when provided indices are 0-indexed and // 1-indexed error messages are desired - StringErrorCollector(string* error_text, bool one_indexing); + StringErrorCollector(std::string* error_text, bool one_indexing); StringErrorCollector(const StringErrorCollector&) = delete; StringErrorCollector& operator=(const StringErrorCollector&) = delete; @@ -61,7 +61,7 @@ class StringErrorCollector : public protobuf::io::ErrorCollector { absl::string_view message) override; private: - string* const error_text_; + std::string* const error_text_; const int index_offset_; }; diff --git a/tensorflow/core/util/proto/proto_utils_test.cc b/tensorflow/core/util/proto/proto_utils_test.cc index 8632c2a5e29d52..460e41ad770c31 100644 --- a/tensorflow/core/util/proto/proto_utils_test.cc +++ b/tensorflow/core/util/proto/proto_utils_test.cc @@ -61,21 +61,21 @@ TEST(ParseTextFormatFromStringTest, DiesOnNullOutputPointer) { } TEST(StringErrorCollectorTest, AppendsError) { - string err; + std::string err; StringErrorCollector collector(&err); collector.RecordError(1, 2, "foo"); EXPECT_EQ("1(2): foo\n", err); } TEST(StringErrorCollectorTest, AppendsWarning) { - string err; + std::string err; StringErrorCollector collector(&err); collector.RecordWarning(1, 2, "foo"); EXPECT_EQ("1(2): foo\n", err); } TEST(StringErrorCollectorTest, AppendsMultipleError) { - string err; + std::string err; StringErrorCollector collector(&err); collector.RecordError(1, 2, "foo"); collector.RecordError(3, 4, "bar"); @@ -83,7 +83,7 @@ TEST(StringErrorCollectorTest, AppendsMultipleError) { } TEST(StringErrorCollectorTest, AppendsMultipleWarning) { - string err; + std::string err; StringErrorCollector collector(&err); collector.RecordWarning(1, 2, "foo"); collector.RecordWarning(3, 4, "bar"); @@ -91,7 +91,7 @@ TEST(StringErrorCollectorTest, AppendsMultipleWarning) { } TEST(StringErrorCollectorTest, OffsetWorks) { - string err; + std::string err; StringErrorCollector collector(&err, true); collector.RecordError(1, 2, "foo"); collector.RecordWarning(3, 4, "bar"); diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc index 26a06bbb6ff129..a9f1675544a2f2 100644 --- a/tensorflow/core/util/stat_summarizer.cc +++ b/tensorflow/core/util/stat_summarizer.cc @@ -82,7 +82,7 @@ void StatSummarizer::Validate(const std::vector* outputs, } void StatSummarizer::PrintStepStats() const { - string output = GetOutputString(); + std::string output = GetOutputString(); std::istringstream iss(output); for (std::string line; std::getline(iss, line);) { LOG(INFO) << line; diff --git a/tensorflow/core/util/stat_summarizer.h b/tensorflow/core/util/stat_summarizer.h index 3eae427f548475..62d192ab5193d2 100644 --- a/tensorflow/core/util/stat_summarizer.h +++ b/tensorflow/core/util/stat_summarizer.h @@ -89,7 +89,7 @@ class StatSummarizer { return stats_calculator_->GetStatsByNodeType(); } - std::string GetStatsByMetric(const string& title, + std::string GetStatsByMetric(const std::string& title, StatsCalculator::SortingMetric sorting_metric, int num_stats) const { return stats_calculator_->GetStatsByMetric(title, sorting_metric, diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h index 4787bcf6ded5d2..63d6a3f1a9f146 100644 --- a/tensorflow/core/util/stream_executor_util.h +++ 
b/tensorflow/core/util/stream_executor_util.h @@ -32,7 +32,8 @@ class StreamExecutorUtil { template static se::DeviceMemory AsDeviceMemory(const Tensor& t) { T* ptr = reinterpret_cast(const_cast(t.tensor_data().data())); - return se::DeviceMemory(se::DeviceMemoryBase(ptr, t.TotalBytes())); + return se::DeviceMemory( + stream_executor::DeviceAddressBase(ptr, t.TotalBytes())); } }; diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc index 93c5a7e9818ae2..3984d78e1b90bc 100644 --- a/tensorflow/core/util/strided_slice_op.cc +++ b/tensorflow/core/util/strided_slice_op.cc @@ -33,13 +33,13 @@ constexpr int32_t kShrinkAxis = -1, kNewAxis = -2; // if one does foo[3:5, ..., -3], this will have 3 length tensors struct StridedSliceSparseSpec { int64_t dims; - int32 num_add_axis_after_ellipsis; + int32_t num_add_axis_after_ellipsis; const Tensor* begin_tensor; const Tensor* end_tensor; const Tensor& strides_tensor; - const int32 begin_mask, end_mask; - int32 ellipsis_mask; - const int32 new_axis_mask, shrink_axis_mask; + const int32_t begin_mask, end_mask; + int32_t ellipsis_mask; + const int32_t new_axis_mask, shrink_axis_mask; }; // Dense slicing specification @@ -49,8 +49,8 @@ struct StridedSliceSparseSpec { // sparse had 3 length tensors. struct StridedSliceDenseSpec { const int64_t dims; - int32 begin_mask; - int32 end_mask; + int32_t begin_mask; + int32_t end_mask; bool begin_valid; bool end_valid; absl::InlinedVector& begin; @@ -62,18 +62,18 @@ struct StridedSliceDenseSpec { // entries. If an index in this array is positive, the size of the dimension // is obtained from canonical end-begin. Otherwise, if it is a kNewAxis, // it will be 1. A shrunk dimension is skipped. - absl::InlinedVector final_shape_gather_indices; + absl::InlinedVector final_shape_gather_indices; // This vector has the same size as final_shape_gather_indices, but it // remembers the sparse index that a dimension comes from, instead of dense // index. A -1 in this vector means there the index is not from the sparse // input. - absl::InlinedVector final_shape_gather_indices_sparse; - absl::InlinedVector input_shape_gather_indices_sparse; + absl::InlinedVector final_shape_gather_indices_sparse; + absl::InlinedVector input_shape_gather_indices_sparse; // The dense indexed shrink mask is which processing dimensions // should be shrunk. For example, if foo.shape = (10,10,10,10) // foo[3, ..., 5] has sparse_shrink_axis_mask of 0x5 and // dense_shrink_axis_mask of 0x9, yielding a final shape (10,10). - int32 shrink_axis_mask; + int32_t shrink_axis_mask; }; } // namespace @@ -281,7 +281,7 @@ absl::Status ValidateStridedSliceOp( *strides}; if (strides_tensor.dtype() == DT_INT32) { - TF_RETURN_IF_ERROR(BuildDenseSpec(sparse_spec, &dense_spec)); + TF_RETURN_IF_ERROR(BuildDenseSpec(sparse_spec, &dense_spec)); } else if (strides_tensor.dtype() == DT_INT64) { TF_RETURN_IF_ERROR(BuildDenseSpec(sparse_spec, &dense_spec)); } else if (strides_tensor.dtype() == DT_INT16) { diff --git a/tensorflow/core/util/tensor_bundle/naming.cc b/tensorflow/core/util/tensor_bundle/naming.cc index d59f12cd856148..fc5ab0b624754e 100644 --- a/tensorflow/core/util/tensor_bundle/naming.cc +++ b/tensorflow/core/util/tensor_bundle/naming.cc @@ -24,16 +24,16 @@ limitations under the License. 
namespace tensorflow { -string MetaFilename(absl::string_view prefix) { - return strings::Printf("%.*s.index", static_cast(prefix.size()), +std::string MetaFilename(absl::string_view prefix) { + return absl::StrFormat("%.*s.index", static_cast(prefix.size()), prefix.data()); } -string DataFilename(absl::string_view prefix, int32_t shard_id, - int32_t num_shards) { +std::string DataFilename(absl::string_view prefix, int32_t shard_id, + int32_t num_shards) { DCHECK_GT(num_shards, 0); DCHECK_LT(shard_id, num_shards); - return strings::Printf("%.*s.data-%05d-of-%05d", + return absl::StrFormat("%.*s.data-%05d-of-%05d", static_cast(prefix.size()), prefix.data(), shard_id, num_shards); } diff --git a/tensorflow/core/util/tensor_bundle/naming.h b/tensorflow/core/util/tensor_bundle/naming.h index c98abac755102a..3acd5dcdd9bbe8 100644 --- a/tensorflow/core/util/tensor_bundle/naming.h +++ b/tensorflow/core/util/tensor_bundle/naming.h @@ -40,9 +40,9 @@ limitations under the License. namespace tensorflow { -string MetaFilename(absl::string_view prefix); -string DataFilename(absl::string_view prefix, int32_t shard_id, - int32_t num_shards); +std::string MetaFilename(absl::string_view prefix); +std::string DataFilename(absl::string_view prefix, int32_t shard_id, + int32_t num_shards); } // namespace tensorflow diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc index afa764a2e15227..1037ffd542b668 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc @@ -237,7 +237,7 @@ tstring* GetStringBackingBuffer(const Tensor& val) { absl::Status ParseEntryProto(absl::string_view key, absl::string_view value, protobuf::MessageLite* out) { - if (!out->ParseFromArray(value.data(), value.size())) { + if (!out->ParseFromString(value)) { return errors::DataLoss("Entry for key ", key, " not parseable."); } return absl::OkStatus(); @@ -1225,7 +1225,7 @@ string BundleReader::DebugString() { BundleEntryProto entry; Seek(kHeaderEntryKey); for (Next(); Valid(); Next()) { - CHECK(entry.ParseFromArray(value().data(), value().size())); + CHECK(entry.ParseFromString(value())); if (entry.slices_size() > 0) continue; // Slice of some partitioned var. strings::StrAppend(&shape_str, key(), " (", DataType_Name(entry.dtype()), diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc index d25c6018e5beb9..592583c1acb2de 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc @@ -50,13 +50,13 @@ using ::testing::ElementsAre; namespace { // Prepend the current test case's working temporary directory to -string Prefix(const string& prefix) { +std::string Prefix(const std::string& prefix) { return absl::StrCat(testing::TmpDir(), "/", prefix); } // Construct a data input directory by prepending the test data root // directory to -string TestdataPrefix(const string& prefix) { +std::string TestdataPrefix(const std::string& prefix) { return absl::StrCat(testing::TensorFlowSrcRoot(), "/core/util/tensor_bundle/testdata/", prefix); } @@ -87,7 +87,7 @@ Tensor ByteSwap(Tensor t) { // Assert that has a tensor under matching in // terms of both shape, dtype, and value template -void Expect(BundleReader* reader, const string& key, +void Expect(BundleReader* reader, const std::string& key, const Tensor& expected_val) { // Tests for Contains(). 
EXPECT_TRUE(reader->Contains(key)); @@ -104,7 +104,7 @@ void Expect(BundleReader* reader, const string& key, } template -void ExpectVariant(BundleReader* reader, const string& key, +void ExpectVariant(BundleReader* reader, const std::string& key, const Tensor& expected_t) { // Tests for Contains(). EXPECT_TRUE(reader->Contains(key)); @@ -137,8 +137,8 @@ void ExpectNext(BundleReader* reader, const Tensor& expected_val) { test::ExpectTensorEqual(val, expected_val); } -std::vector AllTensorKeys(BundleReader* reader) { - std::vector ret; +std::vector AllTensorKeys(BundleReader* reader) { + std::vector ret; reader->Seek(kHeaderEntryKey); reader->Next(); for (; reader->Valid(); reader->Next()) { @@ -149,9 +149,9 @@ std::vector AllTensorKeys(BundleReader* reader) { // Writes out the metadata file of a bundle again, with the endianness marker // bit flipped. -absl::Status FlipEndiannessBit(const string& prefix) { +absl::Status FlipEndiannessBit(const std::string& prefix) { Env* env = Env::Default(); - const string metadata_tmp_path = Prefix("some_tmp_path"); + const std::string metadata_tmp_path = Prefix("some_tmp_path"); std::unique_ptr metadata_file; TF_RETURN_IF_ERROR(env->NewWritableFile(metadata_tmp_path, &metadata_file)); // We create the builder lazily in case we run into an exception earlier, in @@ -161,8 +161,8 @@ absl::Status FlipEndiannessBit(const string& prefix) { // Reads the existing metadata file, and fills the builder. { - const string filename = MetaFilename(prefix); - uint64 file_size; + const std::string filename = MetaFilename(prefix); + uint64_t file_size; TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size)); std::unique_ptr file; TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file)); @@ -213,7 +213,7 @@ void TestBasic() { TF_ASSERT_OK(reader.status()); EXPECT_EQ( AllTensorKeys(&reader), - std::vector({"foo_000", "foo_001", "foo_002", "foo_003"})); + std::vector({"foo_000", "foo_001", "foo_002", "foo_003"})); Expect(&reader, "foo_000", Constant_2x3(T(0))); Expect(&reader, "foo_001", Constant_2x3(T(1))); Expect(&reader, "foo_002", Constant_2x3(T(2))); @@ -243,7 +243,7 @@ void TestBasic() { TF_ASSERT_OK(reader.status()); EXPECT_EQ( AllTensorKeys(&reader), - std::vector({"bar_000", "bar_001", "bar_002", "bar_003"})); + std::vector({"bar_000", "bar_001", "bar_002", "bar_003"})); Expect(&reader, "bar_003", Constant_2x3(T(3))); Expect(&reader, "bar_002", Constant_2x3(T(2))); Expect(&reader, "bar_001", Constant_2x3(T(1))); @@ -267,8 +267,8 @@ void TestBasic() { TF_ASSERT_OK(reader.status()); EXPECT_EQ( AllTensorKeys(&reader), - std::vector({"bar_000", "bar_001", "bar_002", "bar_003", - "foo_000", "foo_001", "foo_002", "foo_003"})); + std::vector({"bar_000", "bar_001", "bar_002", "bar_003", + "foo_000", "foo_001", "foo_002", "foo_003"})); Expect(&reader, "bar_000", Constant_2x3(T(0))); Expect(&reader, "bar_001", Constant_2x3(T(1))); Expect(&reader, "bar_002", Constant_2x3(T(2))); @@ -361,8 +361,8 @@ TEST(TensorBundleTest, SwapBytes) { // 64-bit types // Cast to uint64*/int64* to make DataTypeToEnum happy - TestByteSwap(reinterpret_cast(forward_64), - reinterpret_cast(swapped_64), arr_len_64); + TestByteSwap(reinterpret_cast(forward_64), + reinterpret_cast(swapped_64), arr_len_64); TestByteSwap(reinterpret_cast(forward_64), reinterpret_cast(swapped_64), arr_len_64); TestByteSwap(reinterpret_cast(forward_64), @@ -413,7 +413,7 @@ void TestEndianness() { TF_ASSERT_OK(reader.status()); EXPECT_EQ( AllTensorKeys(&reader), - std::vector({"foo_000", "foo_001", "foo_002", 
"foo_003"})); + std::vector({"foo_000", "foo_001", "foo_002", "foo_003"})); Expect(&reader, "foo_000", Constant_2x3(T(0))); Expect(&reader, "foo_001", Constant_2x3(T(1))); Expect(&reader, "foo_002", Constant_2x3(T(2))); @@ -444,7 +444,7 @@ void TestEndianness() { TF_ASSERT_OK(reader.status()); EXPECT_EQ( AllTensorKeys(&reader), - std::vector({"bar_000", "bar_001", "bar_002", "bar_003"})); + std::vector({"bar_000", "bar_001", "bar_002", "bar_003"})); Expect(&reader, "bar_003", Constant_2x3(T(3))); Expect(&reader, "bar_002", Constant_2x3(T(2))); Expect(&reader, "bar_001", Constant_2x3(T(1))); @@ -468,8 +468,8 @@ void TestEndianness() { TF_ASSERT_OK(reader.status()); EXPECT_EQ( AllTensorKeys(&reader), - std::vector({"bar_000", "bar_001", "bar_002", "bar_003", - "foo_000", "foo_001", "foo_002", "foo_003"})); + std::vector({"bar_000", "bar_001", "bar_002", "bar_003", + "foo_000", "foo_001", "foo_002", "foo_003"})); Expect(&reader, "bar_000", Constant_2x3(T(0))); Expect(&reader, "bar_001", Constant_2x3(T(1))); Expect(&reader, "bar_002", Constant_2x3(T(2))); @@ -519,7 +519,7 @@ void TestNonStandardShapes() { // Writes a bundle to disk with a bad "version"; checks for "expected_error". void VersionTest(const VersionDef& version, absl::string_view expected_error) { - const string path = Prefix("version_test"); + const std::string path = Prefix("version_test"); { // Prepare an empty bundle with the given version information. BundleHeaderProto header; @@ -543,10 +543,10 @@ void VersionTest(const VersionDef& version, absl::string_view expected_error) { TEST(TensorBundleTest, Basic) { TestBasic(); TestBasic(); - TestBasic(); - TestBasic(); - TestBasic(); - TestBasic(); + TestBasic(); + TestBasic(); + TestBasic(); + TestBasic(); TestBasic(); TestBasic(); TestBasic(); @@ -560,10 +560,10 @@ TEST(TensorBundleTest, Basic) { TEST(TensorBundleTest, Endianness) { TestEndianness(); TestEndianness(); - TestEndianness(); - TestEndianness(); - TestEndianness(); - TestEndianness(); + TestEndianness(); + TestEndianness(); + TestEndianness(); + TestEndianness(); TestEndianness(); TestEndianness(); TestEndianness(); @@ -704,10 +704,10 @@ TEST(TensorBundleTest, EquivalentSliceTest) { TEST(TensorBundleTest, NonStandardShapes) { TestNonStandardShapes(); TestNonStandardShapes(); - TestNonStandardShapes(); - TestNonStandardShapes(); - TestNonStandardShapes(); - TestNonStandardShapes(); + TestNonStandardShapes(); + TestNonStandardShapes(); + TestNonStandardShapes(); + TestNonStandardShapes(); TestNonStandardShapes(); TestNonStandardShapes(); TestNonStandardShapes(); @@ -723,15 +723,16 @@ TEST(TensorBundleTest, StringTensorsOldFormat) { // varint32s to store string lengths (we now use varint64s). BundleReader reader(Env::Default(), TestdataPrefix("old_string_tensors/foo")); TF_ASSERT_OK(reader.status()); - EXPECT_EQ(AllTensorKeys(&reader), - std::vector({"floats", "scalar", "string_tensor", "strs"})); + EXPECT_EQ( + AllTensorKeys(&reader), + std::vector({"floats", "scalar", "string_tensor", "strs"})); Expect(&reader, "string_tensor", Tensor(DT_STRING, TensorShape({1}))); Expect(&reader, "scalar", test::AsTensor({"hello"})); Expect( &reader, "strs", - test::AsTensor({"hello", "", "x01", string(1 << 10, 'c')})); + test::AsTensor({"hello", "", "x01", std::string(1 << 10, 'c')})); Expect(&reader, "floats", Constant_2x3(16.18)); } @@ -758,8 +759,8 @@ TEST(TensorBundleTest, StringTensors) { Tensor(DT_STRING, TensorShape({1})))); // Empty. 
TF_EXPECT_OK(writer.Add("scalar", test::AsTensor({"hello"}))); TF_EXPECT_OK(writer.Add( - "strs", - test::AsTensor({"hello", "", "x01", string(1 << 25, 'c')}))); + "strs", test::AsTensor( + {"hello", "", "x01", std::string(1 << 25, 'c')}))); // Requires a 64-bit length. tstring* backing_string = long_string_tensor.flat().data(); @@ -775,15 +776,15 @@ TEST(TensorBundleTest, StringTensors) { BundleReader reader(Env::Default(), Prefix("foo")); TF_ASSERT_OK(reader.status()); EXPECT_EQ(AllTensorKeys(&reader), - std::vector({"floats", "long_scalar", "scalar", - "string_tensor", "strs"})); + std::vector({"floats", "long_scalar", "scalar", + "string_tensor", "strs"})); Expect(&reader, "string_tensor", Tensor(DT_STRING, TensorShape({1}))); Expect(&reader, "scalar", test::AsTensor({"hello"})); - Expect( - &reader, "strs", - test::AsTensor({"hello", "", "x01", string(1 << 25, 'c')})); + Expect(&reader, "strs", + test::AsTensor( + {"hello", "", "x01", std::string(1 << 25, 'c')})); Expect(&reader, "floats", Constant_2x3(16.18)); @@ -825,10 +826,10 @@ TEST(TensorBundleTest, StringTensors) { class VariantObject { public: VariantObject() {} - VariantObject(const string& metadata, int64_t value) + VariantObject(const std::string& metadata, int64_t value) : metadata_(metadata), value_(value) {} - string TypeName() const { return "TEST VariantObject"; } + std::string TypeName() const { return "TEST VariantObject"; } void Encode(VariantTensorData* data) const { data->set_type_name(TypeName()); data->set_metadata(metadata_); @@ -846,7 +847,7 @@ class VariantObject { bool operator==(const VariantObject other) const { return metadata_ == other.metadata_ && value_ == other.value_; } - string metadata_; + std::string metadata_; int64_t value_; }; @@ -874,8 +875,8 @@ TEST(TensorBundleTest, VariantTensors) { TEST(TensorBundleTest, DirectoryStructure) { Env* env = Env::Default(); // Writes two bundles. - const std::vector kBundlePrefixes = {Prefix("worker0"), - Prefix("worker1")}; + const std::vector kBundlePrefixes = {Prefix("worker0"), + Prefix("worker1")}; for (int i = 0; i < 2; ++i) { BundleWriter writer(env, kBundlePrefixes[i]); TF_EXPECT_OK( @@ -884,10 +885,10 @@ TEST(TensorBundleTest, DirectoryStructure) { } // Ensures we have the expected files. - auto CheckDirFiles = [env](const string& bundle_prefix, - absl::Span expected_files) { + auto CheckDirFiles = [env](const std::string& bundle_prefix, + absl::Span expected_files) { absl::string_view dir = io::Dirname(bundle_prefix); - for (const string& expected_file : expected_files) { + for (const std::string& expected_file : expected_files) { TF_EXPECT_OK(env->FileExists(io::JoinPath(dir, expected_file))); } }; @@ -901,7 +902,7 @@ TEST(TensorBundleTest, DirectoryStructure) { {"worker1.index", "worker1.data-00000-of-00001"}); // Trivially "merge" one bundle to some other location (i.e., a renaming). 
- const string kAnotherPrefix = Prefix("another"); + const std::string kAnotherPrefix = Prefix("another"); TF_ASSERT_OK(MergeBundles(env, {kBundlePrefixes[0]}, kAnotherPrefix)); CheckDirFiles(kAnotherPrefix, {"another.index", "another.data-00000-of-00001"}); @@ -910,7 +911,7 @@ TEST(TensorBundleTest, DirectoryStructure) { // merged.index // merged.data-00000-of-00002 // merged.data-00001-of-00002 - const string kMerged = Prefix("merged"); + const std::string kMerged = Prefix("merged"); TF_ASSERT_OK( MergeBundles(env, {kAnotherPrefix, kBundlePrefixes[1]}, kMerged)); CheckDirFiles(kMerged, {"merged.index", "merged.data-00000-of-00002", @@ -919,8 +920,8 @@ TEST(TensorBundleTest, DirectoryStructure) { TEST(TensorBundleTest, SortForSequentialAccess) { Env* env = Env::Default(); - const std::vector kBundlePrefixes = {Prefix("worker0"), - Prefix("worker1")}; + const std::vector kBundlePrefixes = {Prefix("worker0"), + Prefix("worker1")}; BundleWriter writer0(env, kBundlePrefixes[0]); for (int i = 0; i < 3; ++i) { TF_EXPECT_OK( @@ -935,7 +936,7 @@ TEST(TensorBundleTest, SortForSequentialAccess) { } TF_ASSERT_OK(writer1.Finish()); - const string kMerged = Prefix("merged"); + const std::string kMerged = Prefix("merged"); TF_ASSERT_OK( MergeBundles(env, {kBundlePrefixes[0], kBundlePrefixes[1]}, kMerged)); @@ -945,10 +946,11 @@ TEST(TensorBundleTest, SortForSequentialAccess) { BundleReader reader(env, kMerged); TF_ASSERT_OK(reader.status()); - std::vector tensor_names = {"tensor-1-0", "tensor-0-1", "tensor-1-2", - "tensor-0-0", "tensor-1-1", "tensor-0-2"}; - TF_ASSERT_OK(reader.SortForSequentialAccess( - tensor_names, [](const string& element) { return element; })); + std::vector tensor_names = {"tensor-1-0", "tensor-0-1", + "tensor-1-2", "tensor-0-0", + "tensor-1-1", "tensor-0-2"}; + TF_ASSERT_OK(reader.SortForSequentialAccess( + tensor_names, [](const std::string& element) { return element; })); EXPECT_THAT(tensor_names, ElementsAre("tensor-0-0", "tensor-0-1", "tensor-0-2", "tensor-1-2", "tensor-1-1", "tensor-1-0")); @@ -976,11 +978,11 @@ TEST(TensorBundleTest, Error) { TEST(TensorBundleTest, Checksum) { // Randomly flips a byte in [pos_lhs, end of data file), or exactly byte // pos_lhs if exact_pos == True. - auto FlipByte = [](const string& prefix, int pos_lhs, + auto FlipByte = [](const std::string& prefix, int pos_lhs, bool exact_pos = false) { DCHECK_GE(pos_lhs, 0); - const string& datafile = DataFilename(Prefix(prefix), 0, 1); - string data; + const std::string& datafile = DataFilename(Prefix(prefix), 0, 1); + std::string data; TF_ASSERT_OK(ReadFileToString(Env::Default(), datafile, &data)); int byte_pos = 0; @@ -995,8 +997,8 @@ TEST(TensorBundleTest, Checksum) { TF_ASSERT_OK(WriteStringToFile(Env::Default(), datafile, data)); }; // The lookup should fail with a checksum-related message. - auto ExpectLookupFails = [](const string& prefix, const string& key, - const string& expected_msg, Tensor& val) { + auto ExpectLookupFails = [](const std::string& prefix, const std::string& key, + const std::string& expected_msg, Tensor& val) { BundleReader reader(Env::Default(), Prefix(prefix)); absl::Status status = reader.Lookup(key, &val); EXPECT_TRUE(absl::IsDataLoss(status)); @@ -1048,8 +1050,8 @@ TEST(TensorBundleTest, TruncatedTensorContents) { TF_ASSERT_OK(writer.Finish()); // Truncates the data file by one byte, so that we hit EOF. 
- const string datafile = DataFilename(Prefix("end"), 0, 1); - string data; + const std::string datafile = DataFilename(Prefix("end"), 0, 1); + std::string data; TF_ASSERT_OK(ReadFileToString(env, datafile, &data)); ASSERT_TRUE(!data.empty()); TF_ASSERT_OK(WriteStringToFile( @@ -1143,7 +1145,7 @@ TEST(TensorBundleTest, LargeVariableLoadingTest) { TF_ASSERT_OK(reader.status()); EXPECT_EQ( AllTensorKeys(&reader), - std::vector({"foo_000", "foo_001", "foo_002", "foo_003"})); + std::vector({"foo_000", "foo_001", "foo_002", "foo_003"})); Expect(&reader, "foo_000", Constant_100x100(0)); Expect(&reader, "foo_001", Constant_100x100(1)); Expect(&reader, "foo_002", Constant_100x100(2)); @@ -1220,7 +1222,8 @@ TEST(BundleCacheTest, ConcurrentGetFile) { class TensorBundleAlignmentTest : public ::testing::Test { protected: template - void ExpectAlignment(BundleReader* reader, const string& key, int alignment) { + void ExpectAlignment(BundleReader* reader, const std::string& key, + int alignment) { BundleEntryProto full_tensor_entry; TF_ASSERT_OK(reader->GetBundleEntryProto(key, &full_tensor_entry)); EXPECT_EQ(0, full_tensor_entry.offset() % alignment); @@ -1243,7 +1246,7 @@ TEST_F(TensorBundleAlignmentTest, AlignmentTest) { TF_ASSERT_OK(reader.status()); EXPECT_EQ( AllTensorKeys(&reader), - std::vector({"foo_000", "foo_001", "foo_002", "foo_003"})); + std::vector({"foo_000", "foo_001", "foo_002", "foo_003"})); Expect(&reader, "foo_000", Constant_2x3(0)); Expect(&reader, "foo_001", Constant_2x3(1)); Expect(&reader, "foo_002", Constant_2x3(2)); @@ -1298,7 +1301,7 @@ BENCHMARK(BM_BundleAlignment)->ArgPair(4096, 1048576); static void BM_BundleWriterSmallTensor(::testing::benchmark::State& state) { const int64_t bytes = state.range(0); - Tensor t = Constant(static_cast('a'), TensorShape{bytes}); + Tensor t = Constant(static_cast('a'), TensorShape{bytes}); BundleWriter writer(Env::Default(), Prefix("foo")); int suffix = 0; for (auto s : state) { @@ -1311,7 +1314,7 @@ BENCHMARK(BM_BundleWriterSmallTensor)->Range(1, 1 << 20); static void BM_BundleWriterLargeTensor(::testing::benchmark::State& state) { const int mb = state.range(0); const int64_t bytes = static_cast(mb) * (1 << 20); - Tensor t = Constant(static_cast('a'), TensorShape{bytes}); + Tensor t = Constant(static_cast('a'), TensorShape{bytes}); for (auto s : state) { BundleWriter writer(Env::Default(), Prefix("foo")); TF_CHECK_OK(writer.Add("big", t)); diff --git a/tensorflow/dtensor/cc/BUILD b/tensorflow/dtensor/cc/BUILD index 1705ba2425577c..ccdf73f79f15b1 100644 --- a/tensorflow/dtensor/cc/BUILD +++ b/tensorflow/dtensor/cc/BUILD @@ -225,7 +225,6 @@ tf_kernel_library( "dtensor_tpu_kernels.cc", ], tags = [ - "cuda-only", "tpu", ], # Disable building of TPU kernels on non-TPU platforms. 
deps = [ diff --git a/tensorflow/dtensor/cc/save_restore_util.cc b/tensorflow/dtensor/cc/save_restore_util.cc index dcaf41baf5f1e6..2f8d75cca43fa9 100644 --- a/tensorflow/dtensor/cc/save_restore_util.cc +++ b/tensorflow/dtensor/cc/save_restore_util.cc @@ -156,13 +156,12 @@ SaveOpSpecs BuildPerDeviceSave( shape_and_slice_specs.push_back({}); mlir::Value new_prefix = - builder - .create( - prefix.getLoc(), - mlir::dyn_cast(prefix.getType()), - prefix, - StringScalarConst(builder, prefix.getLoc(), - DeviceSuffix(device_id, total_devices))) + mlir::TF::AddOp::create( + builder, prefix.getLoc(), + mlir::dyn_cast(prefix.getType()), + prefix, + StringScalarConst(builder, prefix.getLoc(), + DeviceSuffix(device_id, total_devices))) .getZ(); // Generate new prefix based on device_id and save op index, only when // we need a new save_op. diff --git a/tensorflow/dtensor/cc/xla_spmd/layout_to_xla_sharding.cc b/tensorflow/dtensor/cc/xla_spmd/layout_to_xla_sharding.cc index 027f53cc3fc3e2..1c612a8f28a4ff 100644 --- a/tensorflow/dtensor/cc/xla_spmd/layout_to_xla_sharding.cc +++ b/tensorflow/dtensor/cc/xla_spmd/layout_to_xla_sharding.cc @@ -133,8 +133,8 @@ StatusOr<::xla::OpSharding> ConvertLayoutToXlaOpSharding(const Layout& layout) { { // Set Tile Assignment Dimensions by handling both partially sharded and // fully sharded. - int32 product_of_sharded_dimensions = 1; - for (int32 dim_size : layout.num_shards()) { + int32_t product_of_sharded_dimensions = 1; + for (int32_t dim_size : layout.num_shards()) { product_of_sharded_dimensions *= dim_size; xla_sharding.add_tile_assignment_dimensions(dim_size); } diff --git a/tensorflow/dtensor/mlir/cluster_function_conversion.cc b/tensorflow/dtensor/mlir/cluster_function_conversion.cc index 2f725e3e9a383f..51107b7adf544c 100644 --- a/tensorflow/dtensor/mlir/cluster_function_conversion.cc +++ b/tensorflow/dtensor/mlir/cluster_function_conversion.cc @@ -142,8 +142,8 @@ mlir::LogicalResult ReplaceClusterWithPartitionCallOp( llvm::StringRef function_name = cluster_func.getFunc(); builder->setInsertionPoint(cluster_func); - auto call_op = builder->create( - cluster_func.getLoc(), output_types, cluster_func.getOperands(), + auto call_op = mlir::TF::StatefulPartitionedCallOp::create( + *builder, cluster_func.getLoc(), output_types, cluster_func.getOperands(), /*args_attrs=*/nullptr, /*res_attrs=*/nullptr, function_name, mesh_attr, /*config_proto=*/builder->getStringAttr(""), /*executor_type=*/builder->getStringAttr("")); diff --git a/tensorflow/dtensor/mlir/collectives.cc b/tensorflow/dtensor/mlir/collectives.cc index b82304c6fd1749..ca4f5b6e8febda 100644 --- a/tensorflow/dtensor/mlir/collectives.cc +++ b/tensorflow/dtensor/mlir/collectives.cc @@ -105,8 +105,8 @@ StatusOr EmitAllGather( mlir::Location loc = DT_LOC2(input.getLoc(), "DTensorAllGatherOp"); mlir::TF::DTensorAllGatherOp all_gather = - builder.create( - loc, output_type, input, + mlir::TF::DTensorAllGatherOp::create( + builder, loc, output_type, input, mlir::dtensor::LayoutAttr::get(builder.getContext(), src_layout), mlir::dtensor::LayoutAttr::get(builder.getContext(), tgt_layout)); SetSingleLayoutOnOp(all_gather, tgt_layout); @@ -153,8 +153,8 @@ StatusOr EmitAllScatter( mlir::Location loc = DT_LOC2(original_value.getLoc(), "DTensorAllScatterOp"); mlir::TF::DTensorAllScatterOp all_scatter = - builder.create( - loc, output_type, original_value, + mlir::TF::DTensorAllScatterOp::create( + builder, loc, output_type, original_value, mlir::dtensor::LayoutAttr::get(builder.getContext(), original_layout), 
mlir::dtensor::LayoutAttr::get(builder.getContext(), desired_layout)); SetSingleLayoutOnOp(all_scatter, desired_layout); @@ -224,11 +224,10 @@ StatusOr EmitAllToAll( LocalTypeFromGlobalType(tgt_layout, global_type)); mlir::Location loc = DT_LOC2(input.getLoc(), "DTensorAllToAllOp"); - mlir::TF::DTensorAllToAllOp all_to_all = - builder.create( - loc, output_type, input, - mlir::dtensor::LayoutAttr::get(builder.getContext(), src_layout), - mlir::dtensor::LayoutAttr::get(builder.getContext(), tgt_layout)); + mlir::TF::DTensorAllToAllOp all_to_all = mlir::TF::DTensorAllToAllOp::create( + builder, loc, output_type, input, + mlir::dtensor::LayoutAttr::get(builder.getContext(), src_layout), + mlir::dtensor::LayoutAttr::get(builder.getContext(), tgt_layout)); SetSingleLayoutOnOp(all_to_all, tgt_layout); if (newly_created_ops != nullptr) newly_created_ops->insert(all_to_all); @@ -247,20 +246,21 @@ StatusOr EmitDenseToSparseToDense( // values tensor = tf.gather_nd(input, indices) // shape tensor = tf.shape(input) mlir::TF::ZerosLikeOp zeros_like = - builder.create(input.getLoc(), input); - mlir::TF::NotEqualOp not_equal = builder.create( - zeros_like.getLoc(), input, zeros_like, builder.getBoolAttr(false)); + mlir::TF::ZerosLikeOp::create(builder, input.getLoc(), input); + mlir::TF::NotEqualOp not_equal = + mlir::TF::NotEqualOp::create(builder, zeros_like.getLoc(), input, + zeros_like, builder.getBoolAttr(false)); - mlir::TF::WhereOp indices = builder.create( - not_equal.getLoc(), + mlir::TF::WhereOp indices = mlir::TF::WhereOp::create( + builder, not_equal.getLoc(), mlir::RankedTensorType::get(GetShapeOfValue(not_equal).value(), builder.getI64Type()), not_equal); - mlir::TF::GatherNdOp values = builder.create( - input.getLoc(), input.getType(), input, indices); - auto shape = builder.create(input.getLoc(), input, - builder.getBoolAttr(false)); + mlir::TF::GatherNdOp values = mlir::TF::GatherNdOp::create( + builder, input.getLoc(), input.getType(), input, indices); + auto shape = mlir::TF::ShapeOp::create(builder, input.getLoc(), input, + builder.getBoolAttr(false)); // Emit a SparseToDenseOp and replace the SparseTensor with the result of // this new op. @@ -270,8 +270,8 @@ StatusOr EmitDenseToSparseToDense( builder, input.getLoc(), mlir::cast(input.getType()).getElementType())); - auto dense = builder.create( - input.getLoc(), input.getType(), + auto dense = mlir::TF::SparseToDenseOp::create( + builder, input.getLoc(), input.getType(), mlir::ValueRange({indices, shape, values, zero_scalar})); if (newly_created_ops != nullptr) { @@ -310,8 +310,8 @@ StatusOr EmitRelayout( // If two layouts are the same, or the only difference is layout type, then // there is no need to actually relayout data. 
if (src_layout.IsEquivalentIgnoringType(tgt_layout)) { - mlir::TF::IdentityOp op = builder.create( - input.getLoc(), input.getType(), input); + mlir::TF::IdentityOp op = mlir::TF::IdentityOp::create( + builder, input.getLoc(), input.getType(), input); if (newly_created_ops != nullptr) newly_created_ops->insert(op); return op.getOutput(); } @@ -405,7 +405,7 @@ mlir::Operation* EmitTransposeOp(mlir::OpBuilder& builder, auto constant_attr = builder.getI64TensorAttr(perm_arr); auto perm_op = - builder.create(loc, perm_type, constant_attr); + mlir::TF::ConstOp::create(builder, loc, perm_type, constant_attr); std::vector transposed_shape(shape.begin(), shape.end()); for (int i = 0; i < shape.size(); i++) { @@ -414,8 +414,8 @@ mlir::Operation* EmitTransposeOp(mlir::OpBuilder& builder, auto transposed_type = mlir::RankedTensorType::get( transposed_shape, tr_input_type.getElementType()); - return builder.create(loc, transposed_type, input, - perm_op); + return mlir::TF::TransposeOp::create(builder, loc, transposed_type, input, + perm_op); } StatusOr EmitBarrierWithConstValue(mlir::OpBuilder& builder, @@ -470,10 +470,10 @@ StatusOr EmitAllReduce( DeviceTypeFromMesh(output_layout.mesh())); mlir::Location loc = DT_LOC2(input->getLoc(), "DTensorAllReduceOp"); - auto all_reduce = builder.create( - loc, input->getResultTypes()[0], input->getOpResult(0), - builder.create(DT_LOC2(loc, "group_assignment"), - group_assignment), + auto all_reduce = mlir::TF::DTensorAllReduceOp::create( + builder, loc, input->getResultTypes()[0], input->getOpResult(0), + mlir::TF::ConstOp::create(builder, DT_LOC2(loc, "group_assignment"), + group_assignment), builder.getStringAttr(std::string(reduce_op)), builder.getStringAttr(device_type)); SetSingleLayoutOnOp(all_reduce, output_layout); @@ -575,7 +575,7 @@ StatusOr CreateConstSrcTargetPair(const Mesh& mesh, auto src_target_attr = mlir::DenseIntElementsAttr::get(shaped_type, src_target_pair_flat); mlir::Value src_target_pair_tensor = - builder.create(location, src_target_attr); + mlir::TF::ConstOp::create(builder, location, src_target_attr); return src_target_pair_tensor; } @@ -636,13 +636,14 @@ StatusOr EmitHaloExchange(mlir::OpBuilder& builder, int halo_size, // // For example, if mesh dimension splits the input tensor by its height // dimension, then `left` actually means tensor to pad on the top side. - mlir::Value is_on_left_edge = builder.create( - location, CreateIntScalarConst(0, builder, location, /*use_int64=*/false), + mlir::Value is_on_left_edge = mlir::TF::EqualOp::create( + builder, location, + CreateIntScalarConst(0, builder, location, /*use_int64=*/false), scalar_mesh_coordinate, builder.getBoolAttr(true)); TF_ASSIGN_OR_RETURN(const int mesh_dim_size, mesh.dim_size(mesh_dim)); - mlir::Value is_on_right_edge = builder.create( - location, + mlir::Value is_on_right_edge = mlir::TF::EqualOp::create( + builder, location, CreateIntScalarConst(mesh_dim_size - 1, builder, location, /*use_int64=*/false), scalar_mesh_coordinate, builder.getBoolAttr(true)); @@ -663,7 +664,7 @@ StatusOr EmitHaloExchange(mlir::OpBuilder& builder, int halo_size, } mlir::Value ghost_tensor_left = - builder.create(location, const_attr).getResult(); + mlir::TF::ConstOp::create(builder, location, const_attr).getResult(); // Get the right side slice of the input tensor to pad on left side. 
llvm::SmallVector begin_left(layout.rank(), 0); @@ -676,11 +677,13 @@ StatusOr EmitHaloExchange(mlir::OpBuilder& builder, int halo_size, size[split_dim_index] = halo_size; mlir::Value size_tensor_left = ops_util::GetR1Const(size, builder, location); - mlir::Value sliced_tensor_left = builder.create( - location, halo_type, tensor, begin_tensor_left, size_tensor_left); + mlir::Value sliced_tensor_left = + mlir::TF::SliceOp::create(builder, location, halo_type, tensor, + begin_tensor_left, size_tensor_left); - mlir::Value halo_tensor_left = builder.create( - location, is_on_right_edge, ghost_tensor_left, sliced_tensor_left); + mlir::Value halo_tensor_left = + mlir::TF::SelectV2Op::create(builder, location, is_on_right_edge, + ghost_tensor_left, sliced_tensor_left); // Invoke collective permute to receive the tensor from neighboring processor. // Halo slices from the left neighbor are received on each processor (they @@ -690,12 +693,12 @@ StatusOr EmitHaloExchange(mlir::OpBuilder& builder, int halo_size, CreateConstSrcTargetPair(mesh, mesh_dim, /*shift_left=*/false, location, builder)); - mlir::Value left_concat_value = builder.create( - location, sliced_tensor_left.getType(), halo_tensor_left, + mlir::Value left_concat_value = mlir::TF::CollectivePermuteOp::create( + builder, location, sliced_tensor_left.getType(), halo_tensor_left, src_target_pair_left); mlir::Value ghost_tensor_right = - builder.create(location, const_attr).getResult(); + mlir::TF::ConstOp::create(builder, location, const_attr).getResult(); // Else, values to pad is tensor from different processor. We use collective // permute to access tensor slice from another device. @@ -704,13 +707,15 @@ StatusOr EmitHaloExchange(mlir::OpBuilder& builder, int halo_size, mlir::Value begin_tensor_right = ops_util::GetR1Const(begin_right, builder, location); mlir::Value size_tensor_right = ops_util::GetR1Const(size, builder, location); - mlir::Value sliced_tensor_right = builder.create( - location, halo_type, tensor, begin_tensor_right, size_tensor_right); + mlir::Value sliced_tensor_right = + mlir::TF::SliceOp::create(builder, location, halo_type, tensor, + begin_tensor_right, size_tensor_right); // Find the halo tensor value to pad on the `right` side. // If input block is on the right edge, we use zero ghost tensor instead. - mlir::Value halo_tensor_right = builder.create( - location, is_on_left_edge, ghost_tensor_right, sliced_tensor_right); + mlir::Value halo_tensor_right = + mlir::TF::SelectV2Op::create(builder, location, is_on_left_edge, + ghost_tensor_right, sliced_tensor_right); // Invoke collective permute to receive the tensor from neighboring processor. // Halo slices from the right neighbor are received on each processor (they @@ -719,10 +724,9 @@ StatusOr EmitHaloExchange(mlir::OpBuilder& builder, int halo_size, mlir::Value src_target_pair_right, CreateConstSrcTargetPair(mesh, mesh_dim, /*shift_left=*/true, location, builder)); - mlir::Value right_concat_value = - builder.create( - location, sliced_tensor_right.getType(), halo_tensor_right, - src_target_pair_right); + mlir::Value right_concat_value = mlir::TF::CollectivePermuteOp::create( + builder, location, sliced_tensor_right.getType(), halo_tensor_right, + src_target_pair_right); // Final halo exchanged value is concatenated value of left_concat_value, // tensor, and right_concat_value in the mesh_dimension. 
@@ -734,8 +738,8 @@ StatusOr EmitHaloExchange(mlir::OpBuilder& builder, int halo_size, final_shape, input_tensor_type.getElementType()); mlir::Value concat_axis = CreateIntScalarConst(split_dim_index, builder, location); - mlir::Value final_value = builder.create( - location, final_type, + mlir::Value final_value = mlir::TF::ConcatV2Op::create( + builder, location, final_type, llvm::SmallVector{left_concat_value, tensor, right_concat_value}, concat_axis); diff --git a/tensorflow/dtensor/mlir/collectives.h b/tensorflow/dtensor/mlir/collectives.h index fc0f8f0203d68a..101e944b84d813 100644 --- a/tensorflow/dtensor/mlir/collectives.h +++ b/tensorflow/dtensor/mlir/collectives.h @@ -84,7 +84,7 @@ StatusOr EmitAllReduce( StatusOr EmitBarrierWithConstValue(mlir::OpBuilder& builder, mlir::Location loc, const Mesh& mesh, - int32 value); + int32_t value); // Given input `tensor` that is sharded across spatial dimensions, conduct // halo exchange such that each spatially sharded input blocks exchange diff --git a/tensorflow/dtensor/mlir/collectives_common.cc b/tensorflow/dtensor/mlir/collectives_common.cc index fcda6c26d51988..37bdd53366af82 100644 --- a/tensorflow/dtensor/mlir/collectives_common.cc +++ b/tensorflow/dtensor/mlir/collectives_common.cc @@ -38,7 +38,7 @@ namespace dtensor { // a multi-host cluster will generate the same grouping, and therefore the same // XLA program fingerprint, independently. std::map guarantees the same // iteration order. -using AllReducePartitions = std::map>; +using AllReducePartitions = std::map>; // Computes AllReduce partitions using reduced mesh dimension names. // @@ -60,11 +60,11 @@ StatusOr GetAllReducePartitionsFromReducedDims( const dtensor::Layout& output_layout, const absl::flat_hash_set& reduced_dims) { AllReducePartitions partitions; - for (int64 device = 0; device < output_layout.num_devices(); ++device) { + for (int64_t device = 0; device < output_layout.num_devices(); ++device) { TF_ASSIGN_OR_RETURN(const DeviceLocation device_loc, output_layout.mesh().device_location(device)); DeviceLocation kept_dims; - for (int64 dim_idx = 0; dim_idx < device_loc.size(); ++dim_idx) { + for (int64_t dim_idx = 0; dim_idx < device_loc.size(); ++dim_idx) { if (!reduced_dims.contains(output_layout.mesh().dim_name(dim_idx))) { kept_dims.push_back(device_loc[dim_idx]); } diff --git a/tensorflow/dtensor/mlir/collectives_common.h b/tensorflow/dtensor/mlir/collectives_common.h index 6041eb4501de3f..fe8688ebc673af 100644 --- a/tensorflow/dtensor/mlir/collectives_common.h +++ b/tensorflow/dtensor/mlir/collectives_common.h @@ -29,7 +29,7 @@ namespace tensorflow { namespace dtensor { // Computes AllReduce partitions using reduced mesh dimension names. 
-StatusOr>> +StatusOr>> GetAllReducePartitionsFromReducedDims( const dtensor::Layout& output_layout, const absl::flat_hash_set& reduced_dims); diff --git a/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc b/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc index 4c45da0110c7b0..3b0e959ee32979 100644 --- a/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc +++ b/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc @@ -194,8 +194,8 @@ mlir::LogicalResult CreateMergedMeshCluster( output_values_to_replace.emplace_back(std::get<1>(cluster_return_value)); } - *merged_cluster = builder->create( - current_cluster.getLoc(), merged_cluster_output_types); + *merged_cluster = mlir::tf_device::ClusterOp::create( + *builder, current_cluster.getLoc(), merged_cluster_output_types); auto mesh_attr = current_cluster->getAttrOfType(kMeshAttr); if (!mesh_attr) return current_cluster.emitOpError(kMissingMeshAttributeErrorMessage); @@ -206,8 +206,8 @@ mlir::LogicalResult CreateMergedMeshCluster( // `current_cluster` and `merging_cluster`. merged_cluster->getBody().push_back(new mlir::Block); builder->setInsertionPointToEnd(&merged_cluster->GetBody()); - builder->create(merged_cluster->getLoc(), - merged_cluster_output_values); + mlir::tf_device::ReturnOp::create(*builder, merged_cluster->getLoc(), + merged_cluster_output_values); // Make sure to replace usages of tf_device.cluster ops to be merged-away with // newly created tf_device.cluster op. diff --git a/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc b/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc index 9261255c304033..09b53ae4b72895 100644 --- a/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc +++ b/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc @@ -72,7 +72,7 @@ namespace ops_util = ::mlir::TF::collection_ops_util; // Pad the merged tensor shape to multiples of 1024B, so delinearization // skipping optimization in XLA can get activated. -constexpr int32 kAllReducePadding = 1024; +constexpr int32_t kAllReducePadding = 1024; // Returns true if `successor` depends on `predecessor`. 
// TODO(jiawenhao): Repeatedly computing dependency sets for a large cluster can @@ -151,10 +151,10 @@ mlir::LogicalResult MergeAllReduceGroup( mlir::Location loc = all_reduce_group[0].getLoc(); mlir::Type elem_type = all_reduce_group[0].getType().getElementType(); auto zero_scalar = ops_util::CreateScalarConst(0, builder, loc); - auto zero_scalar_elem_type = builder.create( - loc, mlir::RankedTensorType::get({}, elem_type), zero_scalar); - auto merged = builder.create( - loc, ops_util::GetR1Const({total_num_elements}, builder, loc), + auto zero_scalar_elem_type = mlir::TF::CastOp::create( + builder, loc, mlir::RankedTensorType::get({}, elem_type), zero_scalar); + auto merged = mlir::TF::FillOp::create( + builder, loc, ops_util::GetR1Const({total_num_elements}, builder, loc), zero_scalar_elem_type); // Store every all-reduce's input at an offset location in the merged tensor, @@ -175,23 +175,23 @@ mlir::LogicalResult MergeAllReduceGroup( } int num_elements = all_reduce_ranked_type.getNumElements(); - auto flattened = builder.create( - DT_LOC2(loc, "CombinedReduceFlatten"), all_reduce.getInput(), + auto flattened = mlir::TF::ReshapeOp::create( + builder, DT_LOC2(loc, "CombinedReduceFlatten"), all_reduce.getInput(), ops_util::GetR1Const({num_elements}, builder, loc)); flattened_types.push_back(flattened.getType()); auto indices = ops_util::GetR1Const({offset_num_elements}, builder, loc); if (all_reduce.getDeviceType().contains("TPU")) { - updated = builder.create( - DT_LOC2(loc, "CombinedReduceUpdateSlice"), merged.getType(), + updated = mlir::TF::XlaDynamicUpdateSliceOp::create( + builder, DT_LOC2(loc, "CombinedReduceUpdateSlice"), merged.getType(), /*input=*/i == 0 ? merged.getResult() : updated, /*update=*/flattened, indices); } else { auto end = ops_util::GetR1Const({offset_num_elements + num_elements}, builder, loc); auto strides = ops_util::GetR1Const({1}, builder, loc); - updated = builder.create( - DT_LOC2(loc, "CombinedReduceUpdateSlice"), merged.getType(), + updated = mlir::TF::TensorStridedSliceUpdateOp::create( + builder, DT_LOC2(loc, "CombinedReduceUpdateSlice"), merged.getType(), /*input=*/i == 0 ? merged.getResult() : updated, indices, end, strides, /*value=*/flattened); @@ -200,8 +200,8 @@ mlir::LogicalResult MergeAllReduceGroup( } // All-reduce the updated merged tensor. 
- auto merged_all_reduce = builder.create( - all_reduce_group[0].getLoc(), updated.getType(), updated, + auto merged_all_reduce = mlir::TF::DTensorAllReduceOp::create( + builder, all_reduce_group[0].getLoc(), updated.getType(), updated, all_reduce_group[0].getGroupAssignment(), all_reduce_group[0].getReduceOp(), all_reduce_group[0].getDeviceType()); SetSingleLayoutOnOp( @@ -223,13 +223,13 @@ mlir::LogicalResult MergeAllReduceGroup( all_reduce_ranked_type)); } int num_elements = all_reduce_ranked_type.getNumElements(); - auto slice = builder.create( - DT_LOC2(loc, "PostCombinedReduceSlice"), flattened_types[i], + auto slice = mlir::TF::SliceOp::create( + builder, DT_LOC2(loc, "PostCombinedReduceSlice"), flattened_types[i], /*input=*/merged_all_reduce, /*begin=*/ops_util::GetR1Const({offset_num_elements}, builder, loc), /*size=*/ops_util::GetR1Const({num_elements}, builder, loc)); - auto replacement = builder.create( - DT_LOC2(loc, "PostCombinedReduceReshape"), slice.getResult(), + auto replacement = mlir::TF::ReshapeOp::create( + builder, DT_LOC2(loc, "PostCombinedReduceReshape"), slice.getResult(), ops_util::GetR1Const(all_reduce_shapes[i], builder, loc)); replacements.push_back(replacement); offset_num_elements += num_elements; diff --git a/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc b/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc index 682af5ae92b021..5721d03ce2c343 100644 --- a/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc +++ b/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc @@ -64,16 +64,16 @@ mlir::DenseIntElementsAttr GetScatterGroupAssignment( auto partitions = GetAllReducePartitionsFromReducedDims(original_layout, scattered_dims) .value(); - const int32 num_partitions = partitions.size(); + const int32_t num_partitions = partitions.size(); // Construct a flattened list of scatter partitions. 
- std::vector partitions_flat; + std::vector partitions_flat; for (auto& p : partitions) { partitions_flat.insert(partitions_flat.end(), p.second.begin(), p.second.end()); } - int32 partition_size = partitions.begin()->second.size(); + int32_t partition_size = partitions.begin()->second.size(); mlir::OpBuilder builder(all_scatter); auto group_shaped_type = mlir::RankedTensorType::get( {num_partitions, partition_size}, @@ -137,14 +137,14 @@ mlir::LogicalResult ApplyOptimization(mlir::func::FuncOp function) { VLOG(2) << "Fuse reduce scatter with scatter_dim: " << scatter_dim; mlir::OpBuilder builder(all_reduce); - auto scatter_dim_const_op = builder.create( - all_reduce.getLoc(), + auto scatter_dim_const_op = mlir::TF::ConstOp::create( + builder, all_reduce.getLoc(), mlir::DenseIntElementsAttr::get( mlir::RankedTensorType::get({}, builder.getI32Type()), {scatter_dim})); - auto reduce_scatter = builder.create( - all_reduce.getLoc(), all_scatter->getResultTypes(), + auto reduce_scatter = mlir::TF::DTensorReduceScatterOp::create( + builder, all_reduce.getLoc(), all_scatter->getResultTypes(), all_reduce.getOperand(0), all_reduce.getGroupAssignment(), scatter_dim_const_op, all_reduce.getReduceOp(), all_reduce.getDeviceType()); diff --git a/tensorflow/dtensor/mlir/dtensor_allreduce_sum_optimization.cc b/tensorflow/dtensor/mlir/dtensor_allreduce_sum_optimization.cc index 0a7a232290b8a7..e8a2fde042ae62 100644 --- a/tensorflow/dtensor/mlir/dtensor_allreduce_sum_optimization.cc +++ b/tensorflow/dtensor/mlir/dtensor_allreduce_sum_optimization.cc @@ -160,8 +160,8 @@ mlir::LogicalResult OptimizeAllReduceAndSum(mlir::Operation* op, mlir::OpBuilder builder(op); builder.setInsertionPointAfterValue(op->getResult(0)); mlir::TF::DTensorAllReduceOp all_reduce = - builder.create( - op->getLoc(), op->getResult(0).getType(), op->getResult(0), + mlir::TF::DTensorAllReduceOp::create( + builder, op->getLoc(), op->getResult(0).getType(), op->getResult(0), group_assignment, builder.getStringAttr(std::string(kReduceOpAdd)), builder.getStringAttr(first_reduction_op.getDeviceType())); @@ -394,8 +394,8 @@ mlir::LogicalResult ExtractAllReduceFromWhileOp( // Create a singe reduction operation that reduces the result of the locally // added tensor. - auto new_all_reduce = builder.create( - all_reduce.getLoc(), while_output.getType(), while_output, + auto new_all_reduce = mlir::TF::DTensorAllReduceOp::create( + builder, all_reduce.getLoc(), while_output.getType(), while_output, cloned_group_assignment->getResult(0), builder.getStringAttr(std::string(kReduceOpAdd)), builder.getStringAttr(all_reduce.getDeviceType())); diff --git a/tensorflow/dtensor/mlir/dtensor_layout_to_xla_sharding_op.cc b/tensorflow/dtensor/mlir/dtensor_layout_to_xla_sharding_op.cc index 6cc0a14cb1eefd..457cec03a0e1ca 100644 --- a/tensorflow/dtensor/mlir/dtensor_layout_to_xla_sharding_op.cc +++ b/tensorflow/dtensor/mlir/dtensor_layout_to_xla_sharding_op.cc @@ -112,8 +112,8 @@ void DTensorLayoutToXlaShardingOpPass::runOnOperation() { // the V1 sharding attr, so set V2 sharding to "" here. It may be better // to set the V2 sharding attr here and then removed it when V1 is // removed. - auto sharding_op = builder.create( - layout_op.getLoc(), layout_op.getOutput().getType(), + auto sharding_op = mlir::TF::XlaShardingOp::create( + builder, layout_op.getLoc(), layout_op.getOutput().getType(), layout_op.getInput(), /*sharding=*/builder.getStringAttr(""), // Not used by tf2xla. 
/*_xlaSharding=*/sharding_attr, diff --git a/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc b/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc index f563dceb065671..c0f066483451fe 100644 --- a/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc +++ b/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc @@ -44,7 +44,7 @@ namespace { // the list of devices that are a part of the same reduction group. template mlir::LogicalResult GetAllReduceGroupSize(ReduceOpType reduce_op, - int32* group_size) { + int32_t* group_size) { mlir::DenseIntElementsAttr group_assignment_attr; if (!matchPattern(reduce_op.getGroupAssignment(), m_Constant(&group_assignment_attr))) @@ -80,7 +80,7 @@ mlir::LogicalResult MaybeUpcastForReduction(ReduceOpType reduce_op, mlir::OpBuilder builder(reduce_op); const mlir::Location loc = reduce_op.getLoc(); - int32 group_size; + int32_t group_size; if (mlir::failed(GetAllReduceGroupSize(reduce_op, &group_size))) return mlir::failure(); if (group_size <= ReduceInBfloat16MaxGroupSize()) @@ -98,16 +98,16 @@ mlir::LogicalResult MaybeUpcastForReduction(ReduceOpType reduce_op, const mlir::RankedTensorType& output_type = mlir::dyn_cast(reduce_op.getOutput().getType()); - mlir::TF::CastOp upcast = builder.create( - loc, + mlir::TF::CastOp upcast = mlir::TF::CastOp::create( + builder, loc, mlir::RankedTensorType::get(input_type.getShape(), builder.getF32Type()), reduce_op.getInput()); reduce_op->setOperand(0, upcast.getY()); reduce_op.getOutput().setType(upcast.getY().getType()); builder.setInsertionPointAfter(reduce_op); - mlir::TF::CastOp downcast = builder.create( - loc, + mlir::TF::CastOp downcast = mlir::TF::CastOp::create( + builder, loc, mlir::RankedTensorType::get(output_type.getShape(), output_type.getElementType()), reduce_op); diff --git a/tensorflow/dtensor/mlir/dtensor_replace_relayout_with_identity.cc b/tensorflow/dtensor/mlir/dtensor_replace_relayout_with_identity.cc index d5e957e19050d2..1b320bcfc100ab 100644 --- a/tensorflow/dtensor/mlir/dtensor_replace_relayout_with_identity.cc +++ b/tensorflow/dtensor/mlir/dtensor_replace_relayout_with_identity.cc @@ -37,9 +37,9 @@ class DTensorReplaceRelayoutWithIdentityPass mlir::OpBuilder builder(relayout_op); // Inserts an IdentityOp at the position of the relayout_op with the same // attributes as the relayout_op. - auto new_identity = builder.create( - relayout_op->getLoc(), relayout_op.getType(), relayout_op.getInput(), - relayout_op->getAttrs()); + auto new_identity = mlir::TF::IdentityOp::create( + builder, relayout_op->getLoc(), relayout_op.getType(), + relayout_op.getInput(), relayout_op->getAttrs()); relayout_op.getOutput().replaceAllUsesWith(new_identity.getOutput()); relayout_op.erase(); }); diff --git a/tensorflow/dtensor/mlir/dtensor_send_recv.cc b/tensorflow/dtensor/mlir/dtensor_send_recv.cc index c728725dbaf073..fa6d2bd041189f 100644 --- a/tensorflow/dtensor/mlir/dtensor_send_recv.cc +++ b/tensorflow/dtensor/mlir/dtensor_send_recv.cc @@ -85,8 +85,8 @@ mlir::Value GetOrCreateCompilationKey(mlir::Operation* op) { auto result_type = mlir::RankedTensorType::get({3}, builder.getType()); auto new_compilation_key = - builder.create( - cluster.getLoc(), /*program=*/result_type, + mlir::TF::_XlaCompileMlirPlaceholderProgramKeyOp::create( + builder, cluster.getLoc(), /*program=*/result_type, llvm::ArrayRef{}); return new_compilation_key.getProgram(); } @@ -107,8 +107,8 @@ StatusOr GetDeviceOrdinal(const Mesh& mesh, } // Slice out the device ordinal using the device ID as index. 
   TF_ASSIGN_OR_RETURN(mlir::Value device_id, DeviceId(function));
-  mlir::TF::SliceOp device_ordinal = builder->create<mlir::TF::SliceOp>(
-      loc,
+  mlir::TF::SliceOp device_ordinal = mlir::TF::SliceOp::create(
+      *builder, loc,
       /*output=*/EffectivelyScalarR1Type(builder->getIntegerType(32)),
       /*input=*/IntConst(*builder, loc, device_id_to_ordinal),
       /*begin=*/
@@ -118,8 +118,8 @@ StatusOr<mlir::Value> GetDeviceOrdinal(const Mesh& mesh,
   mlir::Value device_ordinal_scalar =
       ReshapeSizeTypeToScalar(*builder, loc, device_ordinal);
   if (return_int64_type) {
-    device_ordinal_scalar = builder->create<mlir::TF::CastOp>(
-        loc, mlir::RankedTensorType::get({}, builder->getI64Type()),
+    device_ordinal_scalar = mlir::TF::CastOp::create(
+        *builder, loc, mlir::RankedTensorType::get({}, builder->getI64Type()),
         device_ordinal_scalar);
   }
   return device_ordinal_scalar;
@@ -138,8 +138,8 @@ StatusOr<mlir::Operation*> LowerDTensorSendToTFOp(
   absl::Span<const std::string> receiving_devices = target_mesh.local_devices();
   mlir::Operation* lowered_send_op;
-  lowered_send_op = builder.create<mlir::TF::_HostSendOp>(
-      send_input.getLoc(), send_input, tensor_name, sending_devices[0],
+  lowered_send_op = mlir::TF::_HostSendOp::create(
+      builder, send_input.getLoc(), send_input, tensor_name, sending_devices[0],
       /*send_device_incarnation=*/0, receiving_devices[0],
       /*client_terminated=*/false);
@@ -184,12 +184,13 @@ StatusOr<mlir::Operation*> LowerDTensorSendToXlaOp(
           GetDeviceOrdinal(send_input_layout.mesh(), loc, send_func, &builder));
     }
     // Create XlaSendFromHostV2 op
-    lowered_send_op = builder.create<mlir::TF::_XlaSendFromHostV2Op>(
-        loc, value_to_send, program_key, device_ordinal, dtensor_send.getKey());
+    lowered_send_op = mlir::TF::_XlaSendFromHostV2Op::create(
+        builder, loc, value_to_send, program_key, device_ordinal,
+        dtensor_send.getKey());
   } else {
     // Note that for ops running in XLA/TPU, device ordinal input is not needed.
-    lowered_send_op = builder.create<mlir::TF::XlaSendToHostOp>(
-        loc, send_input, dtensor_send.getKey());
+    lowered_send_op = mlir::TF::XlaSendToHostOp::create(
+        builder, loc, send_input, dtensor_send.getKey());
   }
   dtensor_send.erase();
@@ -246,16 +247,16 @@ StatusOr<mlir::Operation*> LowerDTensorRecvToXlaOp(
     auto program_key = GetOrCreateCompilationKey(dtensor_recv);
     builder.setInsertionPoint(dtensor_recv);
-    recv_xla_op = builder.create<mlir::TF::_XlaRecvAtHostV2Op>(
-        dtensor_recv.getLoc(), output_types,
+    recv_xla_op = mlir::TF::_XlaRecvAtHostV2Op::create(
+        builder, dtensor_recv.getLoc(), output_types,
         /*dynamic_key=*/program_key, device_ordinal, dtensor_recv.getKeyAttr());
   } else {
     TF_ASSIGN_OR_RETURN(auto local_shape_attr,
                         GetDTensorRecvLocalShapeAttr(dtensor_recv));
     // Create XlaRecvFromHost op.
-    recv_xla_op = builder.create<mlir::TF::XlaRecvFromHostOp>(
-        dtensor_recv.getLoc(), output_type, local_shape_attr,
+    recv_xla_op = mlir::TF::XlaRecvFromHostOp::create(
+        builder, dtensor_recv.getLoc(), output_type, local_shape_attr,
         dtensor_recv.getKeyAttr());
   }
@@ -299,8 +300,8 @@ StatusOr<mlir::Operation*> LowerDTensorSendFromCPUToTFOp(
   mlir::Operation* lowered_send_op;
   for (size_t i = 0; i < receiving_devices.size(); ++i)
-    lowered_send_op = builder.create<mlir::TF::_HostSendOp>(
-        send_input.getLoc(), dtensor_send.getInput(), tensor_name,
+    lowered_send_op = mlir::TF::_HostSendOp::create(
+        builder, send_input.getLoc(), dtensor_send.getInput(), tensor_name,
         sending_devices[0],
         /*send_device_incarnation=*/0, receiving_devices[i]);
@@ -326,8 +327,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecvFromCPUToTFOp(
   mlir::Operation* lowered_recv_op;
   mlir::Location loc = dtensor_recv.getLoc();
   for (size_t i = 0; i < receiving_devices.size(); ++i)
-    lowered_recv_op = builder.create<mlir::TF::_HostRecvOp>(
-        loc, dtensor_recv.getType(), tensor_name, sending_devices[0],
+    lowered_recv_op = mlir::TF::_HostRecvOp::create(
+        builder, loc, dtensor_recv.getType(), tensor_name, sending_devices[0],
         /*send_device_incarnation=*/0, receiving_devices[i]);
   // Replace dtensor_recv with newly created recv op and remove DTensorRecv op.
@@ -351,8 +352,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecvToTFOp(
   absl::Span<const std::string> receiving_devices = recv_mesh.local_devices();
   mlir::Location loc = dtensor_recv.getLoc();
-  mlir::Operation* lowered_recv_op = builder.create<mlir::TF::_HostRecvOp>(
-      loc, output_type, tensor_name, sending_devices[0],
+  mlir::Operation* lowered_recv_op = mlir::TF::_HostRecvOp::create(
+      builder, loc, output_type, tensor_name, sending_devices[0],
       /*send_device_incarnation=*/0, receiving_devices[0]);
   return lowered_recv_op;
@@ -385,7 +386,7 @@ llvm::SmallVector GenerateBranches(
                ? func_op.getArgument(0)
                : mlir::BlockArgument{};
     auto branch_op = fn(fn_builder, location, arg, it.value());
-    fn_builder.create<mlir::func::ReturnOp>(location, branch_op->getResults());
+    mlir::func::ReturnOp::create(fn_builder, location, branch_op->getResults());
     branches.push_back(mlir::SymbolRefAttr::get(func_op));
   }
@@ -429,25 +430,24 @@ StatusOr<mlir::Operation*> LowerOneToOneDTensorSendToTFHostSend(
         mlir::Value val = arg;
         if (i32_copy) {
           auto val_type = mlir::cast(val.getType());
-          val = op_builder
-                    .create<mlir::TF::CastOp>(
-                        loc,
-                        mlir::RankedTensorType::get(
-                            val_type.getShape(), op_builder.getIntegerType(64)),
-                        val)
+          val = mlir::TF::CastOp::create(
+                    op_builder, loc,
+                    mlir::RankedTensorType::get(val_type.getShape(),
+                                                op_builder.getIntegerType(64)),
+                    val)
                     ->getResult(0);
         }
-        return op_builder.create<mlir::TF::_HostSendOp>(
-            loc, val, tensor_name, std::get<0>(device_pair),
+        return mlir::TF::_HostSendOp::create(
+            op_builder, loc, val, tensor_name, std::get<0>(device_pair),
            /*send_device_incarnation=*/0, std::get<1>(device_pair));
       });
-  mlir::Operation* case_op = builder.create<mlir::TF::CaseOp>(
-      dtensor_send.getLoc(),
-      /*output=*/llvm::ArrayRef{},
-      /*branch_index=*/device_ordinal,
-      /*input=*/dtensor_send->getOperands(),
-      /*branches=*/builder.getArrayAttr(branches),
-      /*is_stateless=*/builder.getBoolAttr(false));
+  mlir::Operation* case_op =
+      mlir::TF::CaseOp::create(builder, dtensor_send.getLoc(),
+                               /*output=*/llvm::ArrayRef{},
+                               /*branch_index=*/device_ordinal,
+                               /*input=*/dtensor_send->getOperands(),
+                               /*branches=*/builder.getArrayAttr(branches),
+                               /*is_stateless=*/builder.getBoolAttr(false));
   // erase the send op here iff targeting a gpu
   // otherwise there will be 'op not within cluster' error(s)
@@ -494,14 +494,15 @@ StatusOr<mlir::Operation*> LowerOneToOneDTensorRecvToTFHostRecv(
       "{0}_receive_{1}_{2}", device_pairs,
       [&](mlir::OpBuilder& op_builder, auto& loc, auto _,
          auto device_pair) -> mlir::Operation* {
-        auto recv_op = op_builder.create<mlir::TF::_HostRecvOp>(
-            loc, local_output_type, tensor_name, std::get<0>(device_pair),
+        auto recv_op = mlir::TF::_HostRecvOp::create(
+            op_builder, loc, local_output_type, tensor_name,
+            std::get<0>(device_pair),
            /*send_device_incarnation=*/0, std::get<1>(device_pair));
         SetSingleLayoutOnOp(recv_op, recv_layout);
         return recv_op;
       });
-  mlir::Operation* case_op = builder.create<mlir::TF::CaseOp>(
-      dtensor_recv.getLoc(),
+  mlir::Operation* case_op = mlir::TF::CaseOp::create(
+      builder, dtensor_recv.getLoc(),
       /*output=*/llvm::ArrayRef{local_output_type},
       /*branch_index=*/device_ordinal,
       /*input=*/dtensor_recv->getOperands(),
@@ -510,8 +511,8 @@ StatusOr<mlir::Operation*> LowerOneToOneDTensorRecvToTFHostRecv(
   mlir::Operation* lowered_recv;
   if (i32_copy) {
-    lowered_recv = builder.create<mlir::TF::CastOp>(
-        dtensor_recv.getLoc(), local_recv_type, case_op->getResult(0));
+    lowered_recv = mlir::TF::CastOp::create(
+        builder, dtensor_recv.getLoc(), local_recv_type, case_op->getResult(0));
   } else {
     lowered_recv = case_op;
   }
@@ -639,12 +640,12 @@ StatusOr<mlir::Operation*> LowerDTensorSend(mlir::Operation* send_op,
       GetDeviceOrdinal(*mesh, loc, send_cluster->getParentOfType(), &builder));
-  mlir::Value predicate = builder.create<mlir::TF::EqualOp>(
-      loc, device_ordinal, CreateIntScalarConst(0, builder, loc),
+  mlir::Value predicate = mlir::TF::EqualOp::create(
+      builder, loc, device_ordinal, CreateIntScalarConst(0, builder, loc),
      /*incompatible_shape_error=*/builder.getBoolAttr(true));
-  auto send_if = builder.create<mlir::TF::IfRegionOp>(
-      loc, llvm::SmallVector{}, predicate,
+  auto send_if = mlir::TF::IfRegionOp::create(
+      builder, loc, llvm::SmallVector{}, predicate,
      /*is_stateless=*/builder.getBoolAttr(true),
      GetUniqueControlflowFnName("copy_to_mesh_send_if_then", builder),
      GetUniqueControlflowFnName("copy_to_mesh_send_if_else", builder));
@@ -653,16 +654,15 @@ StatusOr<mlir::Operation*> LowerDTensorSend(mlir::Operation* send_op,
   auto& else_branch = send_if.getElseBranch();
   else_branch.push_back(new mlir::Block);
   builder.setInsertionPointToEnd(&else_branch.front());
-  builder.create<mlir::TF::YieldOp>(
-      loc,
-      /*operands=*/llvm::ArrayRef{});
+  mlir::TF::YieldOp::create(builder, loc,
+                            /*operands=*/llvm::ArrayRef{});
   // Create then branch region with DTensorSend op.
   auto& then_branch = send_if.getThenBranch();
   then_branch.push_back(new mlir::Block);
   builder.setInsertionPointToEnd(&then_branch.front());
-  auto yield = builder.create<mlir::TF::YieldOp>(
-      loc, /*operands=*/llvm::ArrayRef{});
+  auto yield = mlir::TF::YieldOp::create(
+      builder, loc, /*operands=*/llvm::ArrayRef{});
   dtensor_send->moveBefore(yield);
   // Lower DTensorSend op to actual TF op.
@@ -684,8 +684,8 @@ StatusOr<mlir::Operation*> LowerDTensorSend(mlir::Operation* send_op,
   if (!recv_mesh.is_cpu_mesh() &&
       send_type.getElementType().isInteger(32)) {
     builder.setInsertionPointAfter(send_input.getDefiningOp());
-    auto cast_to_int64 = builder.create<mlir::TF::CastOp>(
-        send_input.getLoc(),
+    auto cast_to_int64 = mlir::TF::CastOp::create(
+        builder, send_input.getLoc(),
         mlir::RankedTensorType::get(send_type.getShape(),
                                     builder.getIntegerType(64)),
         send_input);
@@ -781,8 +781,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
       GetDeviceOrdinal(recv_mesh, loc,
                        recv_cluster->getParentOfType(), &builder));
-  mlir::Value predicate = builder.create<mlir::TF::EqualOp>(
-      loc, device_ordinal, CreateIntScalarConst(0, builder, loc),
+  mlir::Value predicate = mlir::TF::EqualOp::create(
+      builder, loc, device_ordinal, CreateIntScalarConst(0, builder, loc),
      /*incompatible_shape_error=*/builder.getBoolAttr(true));
   mlir::TensorType recv_type = dtensor_recv.getType();
@@ -795,8 +795,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
                                       builder.getIntegerType(64))
           : recv_type;
-  auto recv_if = builder.create<mlir::TF::IfRegionOp>(
-      loc, llvm::SmallVector{output_type}, predicate,
+  auto recv_if = mlir::TF::IfRegionOp::create(
+      builder, loc, llvm::SmallVector{output_type}, predicate,
      /*is_stateless=*/builder.getBoolAttr(true),
      GetUniqueControlflowFnName("copy_to_mesh_recv_if_then", builder),
      GetUniqueControlflowFnName("copy_to_mesh_recv_if_else", builder));
@@ -831,9 +831,9 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
     return absl::InvalidArgumentError("unsupported output type");
   }
-  mlir::Value zeros = builder.create<mlir::TF::ConstOp>(loc, const_attr);
-  builder.create<mlir::TF::YieldOp>(
-      loc, /*operands=*/llvm::ArrayRef{zeros});
+  mlir::Value zeros = mlir::TF::ConstOp::create(builder, loc, const_attr);
+  mlir::TF::YieldOp::create(builder, loc,
+                            /*operands=*/llvm::ArrayRef{zeros});
   // Create then branch region with DTensorRecv op.
   auto& then_branch = recv_if.getThenBranch();
@@ -843,8 +843,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
   TF_ASSIGN_OR_RETURN(mlir::Operation * xla_recv,
                       lower_fn(send_mesh, dtensor_recv, output_type));
-  builder.create<mlir::TF::YieldOp>(
-      loc,
+  mlir::TF::YieldOp::create(
+      builder, loc,
      /*operands=*/llvm::ArrayRef{xla_recv->getResult(0)});
   // Broadcast the received output to all GPU/TPU devices.
@@ -859,8 +859,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
                                         kReduceOpAdd));
   if (need_i32_to_i64_upcast) {
-    lowered_recv = builder.create<mlir::TF::CastOp>(
-        loc, recv_type, lowered_recv->getResult(0));
+    lowered_recv = mlir::TF::CastOp::create(builder, loc, recv_type,
+                                            lowered_recv->getResult(0));
   }
   // Replaces usages of DTensorRecv op with the broadcasted value.
diff --git a/tensorflow/dtensor/mlir/expansions/argmax_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/argmax_spmd_expander.cc index e0bbc25792cd66..10b6296d5638b6 100644 --- a/tensorflow/dtensor/mlir/expansions/argmax_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/argmax_spmd_expander.cc @@ -106,9 +106,9 @@ StatusOr ArgMaxSPMDExpander::ExpandOp(mlir::Operation* op) { } } - auto new_argmax = builder.create( - argmax_op.getLoc(), argmax_op.getResult().getType(), input, - argmax_op.getDimension()); + auto new_argmax = mlir::TF::ArgMaxOp::create(builder, argmax_op.getLoc(), + argmax_op.getResult().getType(), + input, argmax_op.getDimension()); op->getResult(0).replaceAllUsesWith(new_argmax.getOutput()); op->erase(); diff --git a/tensorflow/dtensor/mlir/expansions/dataparallel_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/dataparallel_spmd_expander.cc index f53f3b2a188945..6fb9cb790910ed 100644 --- a/tensorflow/dtensor/mlir/expansions/dataparallel_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/dataparallel_spmd_expander.cc @@ -257,8 +257,8 @@ StatusOr DataparallelSPMDExpander::RelayoutOperandsAndOutputs( builder.setInsertionPointAfter(last_op_after_splitting); // Tie all outputs together with identity_n - auto identity_op = builder.create( - op->getLoc(), generated_types, generated_outputs); + auto identity_op = mlir::TF::IdentityNOp::create( + builder, op->getLoc(), generated_types, generated_outputs); newly_created_ops.insert(identity_op); for (int i = 0; i < output_layouts.size(); ++i) { op->getOpResult(i).replaceAllUsesExcept(identity_op.getResult(i), diff --git a/tensorflow/dtensor/mlir/expansions/fft_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/fft_spmd_expander.cc index 4ff627046e47a7..b5957d99dee649 100644 --- a/tensorflow/dtensor/mlir/expansions/fft_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/fft_spmd_expander.cc @@ -281,7 +281,7 @@ StatusOr ExpandFFTN(mlir::Operation* fft_op, } else { TF_ASSIGN_OR_RETURN(auto fft_length_vec, ExtractFFTLengthFromOp(fft_op)); mlir::Value fft_length = IntConst( - builder, location, (int32)fft_length_vec[num_transform_axes - 1]); + builder, location, (int32_t)fft_length_vec[num_transform_axes - 1]); llvm::ArrayRef rfft_shape = mlir::dyn_cast(intermediate.getType()).getShape(); std::vector rfft_shape_vec = rfft_shape.vec(); @@ -380,7 +380,7 @@ StatusOr ExpandIFFTN(mlir::Operation* ifft_op, ExtractFFTLengthFromOp(ifft_op)); mlir::Value ifft_length = IntConst(builder, location, - (int32)complex_fft_length_vec[num_transform_axes - 1]); + (int32_t)complex_fft_length_vec[num_transform_axes - 1]); // IRFFT for the last axis. mlir::TF::IRFFTOp irfft_output_op = mlir::TF::IRFFTOp::create( builder, location, ifft_op->getResult(0).getType(), transposed_output, diff --git a/tensorflow/dtensor/mlir/expansions/io_op_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/io_op_spmd_expander.cc index 0242ebbb0544e3..7de31a8bb7e5f1 100644 --- a/tensorflow/dtensor/mlir/expansions/io_op_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/io_op_spmd_expander.cc @@ -83,8 +83,8 @@ StatusOr Expand(mlir::Operation* op) { mlir::Block* then_fn_block = then_func.addEntryBlock(); mlir::OpBuilder then_fn_builder = mlir::OpBuilder::atBlockBegin(then_fn_block); - then_fn_builder.create(location); - then_fn_builder.create(location); + mlir::TF::NoOp::create(then_fn_builder, location); + mlir::func::ReturnOp::create(then_fn_builder, location); // Build else_func that is the branch of device_id == 0. 
// The else func is just the original op. @@ -100,9 +100,9 @@ StatusOr Expand(mlir::Operation* op) { mlir::OpBuilder else_fn_builder = mlir::OpBuilder::atBlockBegin(else_fn_block); - else_fn_builder.create(location, op->getResultTypes(), - else_fn_block->getArguments()); - else_fn_builder.create(location); + T::create(else_fn_builder, location, op->getResultTypes(), + else_fn_block->getArguments()); + mlir::func::ReturnOp::create(else_fn_builder, location); symbol_table.insert(then_func); symbol_table.insert(else_func); @@ -115,12 +115,12 @@ StatusOr Expand(mlir::Operation* op) { builder, location, mlir::cast(device_id.getType()).getElementType())); - mlir::TF::NotEqualOp not_equal = builder.create( - location, device_id, zero_scalar, + mlir::TF::NotEqualOp not_equal = mlir::TF::NotEqualOp::create( + builder, location, device_id, zero_scalar, /*incompatible_shape_error=*/builder.getBoolAttr(false)); - mlir::Operation* if_op = builder.create( - location, then_func.getFunctionType().getResults(), + mlir::Operation* if_op = mlir::TF::IfOp::create( + builder, location, then_func.getFunctionType().getResults(), /*cond=*/not_equal.getResult(), /*input=*/op->getOperands(), /*then_branch=*/then_func.getSymName(), diff --git a/tensorflow/dtensor/mlir/expansions/iterator_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/iterator_spmd_expander.cc index 8a442d727aa19f..0bd4da477d2205 100644 --- a/tensorflow/dtensor/mlir/expansions/iterator_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/iterator_spmd_expander.cc @@ -62,8 +62,8 @@ StatusOr IteratorGetNextSPMDExpander::ExpandOp( local_shape, global_output_type.getElementType()); } - auto new_op = builder.create( - DT_LOC(op->getLoc()), local_types, original_op->getOperand(0)); + auto new_op = mlir::TF::IteratorGetNextOp::create( + builder, DT_LOC(op->getLoc()), local_types, original_op->getOperand(0)); for (int i = 0; i < original_op->getNumResults(); ++i) { original_op.getResult(i).replaceAllUsesWith(new_op.getResult(i)); diff --git a/tensorflow/dtensor/mlir/expansions/meta_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/meta_spmd_expander.cc index e18d3edf44d913..b2d6ca37777281 100644 --- a/tensorflow/dtensor/mlir/expansions/meta_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/meta_spmd_expander.cc @@ -791,7 +791,7 @@ StatusOr ReshapeSPMDExpander::ExpandOp(mlir::Operation* op) { auto const_attr = mlir::DenseIntElementsAttr::get(new_shape, local_reshape_const); auto new_reshape_const_op = - builder.create(DT_LOC(op), const_attr); + mlir::TF::ConstOp::create(builder, DT_LOC(op), const_attr); mlir::TF::ReshapeOp new_reshape_op = mlir::TF::ReshapeOp::create( builder, op->getLoc(), new_input, new_reshape_const_op); diff --git a/tensorflow/dtensor/mlir/expansions/optional_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/optional_spmd_expander.cc index 3c146a06a48558..a45a2df40a32e4 100644 --- a/tensorflow/dtensor/mlir/expansions/optional_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/optional_spmd_expander.cc @@ -54,8 +54,8 @@ StatusOr OptionalGetValueSPMDExpander::ExpandOp( local_types[i] = local_type; } - auto new_op = builder.create( - DT_LOC(op->getLoc()), local_types, original_op->getOperand(0)); + auto new_op = mlir::TF::OptionalGetValueOp::create( + builder, DT_LOC(op->getLoc()), local_types, original_op->getOperand(0)); for (int i = 0; i < original_op->getNumResults(); ++i) { original_op.getResult(i).replaceAllUsesWith(new_op.getResult(i)); diff --git 
a/tensorflow/dtensor/mlir/expansions/random_op_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/random_op_spmd_expander.cc index b6e1c316cef6f2..6175e133710f7a 100644 --- a/tensorflow/dtensor/mlir/expansions/random_op_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/random_op_spmd_expander.cc @@ -151,26 +151,26 @@ StatusOr GetDeviceSeed(const Layout& layout, mlir::Operation* op) { mlir::Attribute const_attr = mlir::DenseIntElementsAttr::get(const_type, multipliers); mlir::Value multiplier = - builder.create(cluster.getLoc(), const_attr) + mlir::TF::ConstOp::create(builder, cluster.getLoc(), const_attr) .getOutput(); const mlir::RankedTensorType one_by_one = mlir::RankedTensorType::get({1, 1}, builder.getIntegerType(32)); - mlir::Value seed = builder.create( - cluster.getLoc(), one_by_one, mesh_coordinates, multiplier); + mlir::Value seed = mlir::TF::MatMulOp::create( + builder, cluster.getLoc(), one_by_one, mesh_coordinates, multiplier); // Largest prime in 16 bits. mlir::Value prime = CreateIntScalarConst( /*value=*/65521, builder, cluster.getLoc(), /*use_int64=*/false); mlir::Value seed_plus_prime = - builder - .create(cluster.getLoc(), one_by_one, seed, prime) + mlir::TF::AddV2Op::create(builder, cluster.getLoc(), one_by_one, seed, + prime) .getZ(); - mlir::TF::SqueezeOp squeeze = builder.create( - cluster.getLoc(), + mlir::TF::SqueezeOp squeeze = mlir::TF::SqueezeOp::create( + builder, cluster.getLoc(), mlir::RankedTensorType::get({}, builder.getIntegerType(32)), seed_plus_prime, builder.getI64ArrayAttr({0, 1})); @@ -207,11 +207,12 @@ StatusOr ComputeNewSeed(mlir::OpBuilder& builder, mlir::Type seed_type = mlir::cast(op_seed.getType()).getElementType(); - device_id_seed = builder.create( - location, mlir::RankedTensorType::get({}, seed_type), device_id_seed); + device_id_seed = mlir::TF::CastOp::create( + builder, location, mlir::RankedTensorType::get({}, seed_type), + device_id_seed); - mlir::Value seed_xor = - builder.create(location, op_seed, device_id_seed); + mlir::Value seed_xor = mlir::TF::BitwiseXorOp::create( + builder, location, op_seed, device_id_seed); return seed_xor; } @@ -240,8 +241,8 @@ StatusOr CreatedShardedLocalRandomOpV1(const Layout& layout, auto new_shape_value = Int64Const(builder, location, new_random_shape); // TODO(zhonglinhan) : check different input for StatelessRandomUniformInt - auto local_random = builder.create(location, new_random_type, - new_shape_value, seed_xor); + auto local_random = RandomOp::create(builder, location, new_random_type, + new_shape_value, seed_xor); op->getResult(0).replaceAllUsesWith(local_random.getOutput()); op->erase(); return local_random.getOperation(); @@ -272,9 +273,9 @@ StatusOr CreatedShardedLocalRandomOpV2(const Layout& layout, auto new_shape_value = Int64Const(builder, location, new_random_shape); - auto local_random = builder.create( - location, new_random_type, new_shape_value, seed_xor, - random_op.getCounter(), random_op.getAlg()); + auto local_random = + RandomOp::create(builder, location, new_random_type, new_shape_value, + seed_xor, random_op.getCounter(), random_op.getAlg()); op->getResult(0).replaceAllUsesWith(local_random.getOutput()); op->erase(); return local_random.getOperation(); @@ -305,10 +306,10 @@ StatusOr CreatedShardedLocalRandomOpV2Range( auto new_shape_value = Int64Const(builder, location, new_random_shape); - auto local_random = builder.create( - location, new_random_type, new_shape_value, seed_xor, - random_op.getCounter(), random_op.getAlg(), random_op.getMinval(), - 
random_op.getMaxval()); + auto local_random = + RandomOp::create(builder, location, new_random_type, new_shape_value, + seed_xor, random_op.getCounter(), random_op.getAlg(), + random_op.getMinval(), random_op.getMaxval()); op->getResult(0).replaceAllUsesWith(local_random.getOutput()); op->erase(); return local_random.getOperation(); diff --git a/tensorflow/dtensor/mlir/expansions/replicated_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/replicated_spmd_expander.cc index feb5b9eda74a01..f55d62efa81501 100644 --- a/tensorflow/dtensor/mlir/expansions/replicated_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/replicated_spmd_expander.cc @@ -82,8 +82,8 @@ ReplicatedOpSPMDExpander::ReplicatedRelayoutOperandsAndOutputs( builder.setInsertionPointAfter(last_op_after_splitting); // Tie all outputs together with identity_n - auto identity_op = builder.create( - op->getLoc(), generated_types, generated_outputs); + auto identity_op = mlir::TF::IdentityNOp::create( + builder, op->getLoc(), generated_types, generated_outputs); newly_created_ops.insert(identity_op); for (int i = 0; i < output_layouts.size(); ++i) { op->getOpResult(i).replaceAllUsesExcept(identity_op.getResult(i), diff --git a/tensorflow/dtensor/mlir/expansions/segmentation_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/segmentation_spmd_expander.cc index c0aa768d9a5d03..c2fc958965ec33 100644 --- a/tensorflow/dtensor/mlir/expansions/segmentation_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/segmentation_spmd_expander.cc @@ -122,9 +122,9 @@ StatusOr UnsortedSegmentSumSPMDExpander::ExpandOp( EmitRelayout(segment_ids, segment_ids_layout, new_segment_ids_layout)); mlir::OpBuilder builder(op); - mlir::Operation* new_sum_op = builder.create( - op->getLoc(), sum_op.getOutput().getType(), data, new_segment_ids, - sum_op.getNumSegments()); + mlir::Operation* new_sum_op = mlir::TF::UnsortedSegmentSumOp::create( + builder, op->getLoc(), sum_op.getOutput().getType(), data, + new_segment_ids, sum_op.getNumSegments()); InferSPMDExpandedLocalShape(new_sum_op); diff --git a/tensorflow/dtensor/mlir/expansions/slice_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/slice_spmd_expander.cc index fc082290109260..4cf10413879cbf 100644 --- a/tensorflow/dtensor/mlir/expansions/slice_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/slice_spmd_expander.cc @@ -187,11 +187,10 @@ StatusOr SliceSPMDExpander::ExpandOp(mlir::Operation* op) { else new_size = Int64Const(builder, loc, sizes); - auto new_op = builder - .create( - loc, slice_op.getOutput().getType(), relayout_input, - slice_op.getBegin(), new_size) - .getOperation(); + auto new_op = + mlir::TF::SliceOp::create(builder, loc, slice_op.getOutput().getType(), + relayout_input, slice_op.getBegin(), new_size) + .getOperation(); new_op = InferSPMDExpandedLocalShape(new_op); TF_ASSIGN_OR_RETURN(auto relayout_output, diff --git a/tensorflow/dtensor/mlir/expansions/softmax_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/softmax_spmd_expander.cc index 196e3702b1c843..62fc9413e78307 100644 --- a/tensorflow/dtensor/mlir/expansions/softmax_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/softmax_spmd_expander.cc @@ -70,12 +70,12 @@ StatusOr ComputeGlobalReduce( // First compute a local reduce if (reduce_op == kReduceOpAdd) { - local_reduce = builder.create( - input.getLoc(), input, reduction_indices, + local_reduce = mlir::TF::SumOp::create( + builder, input.getLoc(), input, reduction_indices, /*keep_dims=*/builder.getBoolAttr(true)); } else if (reduce_op == 
kReduceOpMax) { - local_reduce = builder.create( - input.getLoc(), input, reduction_indices, + local_reduce = mlir::TF::MaxOp::create( + builder, input.getLoc(), input, reduction_indices, /*keep_dims=*/builder.getBoolAttr(true)); } else { return errors::Unimplemented("reduction ", reduce_op, " not implemented"); @@ -107,8 +107,8 @@ StatusOr ComputeGlobalReduce( // dimension attribute type. Everything else is OK with int32_t dimensions. std::vector reduce_dim_array_64(reduced_dims.begin(), reduced_dims.end()); - global_reduce = builder.create( - input.getLoc(), new_type, global_reduce->getResult(0), + global_reduce = mlir::TF::SqueezeOp::create( + builder, input.getLoc(), new_type, global_reduce->getResult(0), builder.getI64ArrayAttr(reduce_dim_array_64)); } return global_reduce->getResult(0); @@ -143,9 +143,9 @@ absl::Status ComputeExpAndSum(mlir::OpBuilder& builder, // Subtract max from local copy of logits. shifted_logits = - builder.create(loc, logits, max_logits).getResult(); + mlir::TF::SubOp::create(builder, loc, logits, max_logits).getResult(); exp_of_shifted_logits = - builder.create(loc, shifted_logits).getResult(); + mlir::TF::ExpOp::create(builder, loc, shifted_logits).getResult(); // Sum the exponential. TF_ASSIGN_OR_RETURN( @@ -162,8 +162,9 @@ mlir::Value ComputeSoftmax(mlir::OpBuilder& builder, const mlir::Value& exp_of_shifted_logits, const mlir::Value& sum_of_exp) { // For Softmax, we compute exp(shifted_logits)/sum(exp(shifted_logits)) - auto softmax = builder.create( - exp_of_shifted_logits.getLoc(), exp_of_shifted_logits, sum_of_exp); + auto softmax = + mlir::TF::DivOp::create(builder, exp_of_shifted_logits.getLoc(), + exp_of_shifted_logits, sum_of_exp); return softmax.getResult(); } @@ -174,9 +175,9 @@ mlir::Value ComputeLogSoftmax(mlir::OpBuilder& builder, const mlir::Value& sum_of_exp) { // For LogSoftmax, we compute shifted_logs - log(sum(exp(shifted_logits))) auto log_of_sum = - builder.create(shifted_logits.getLoc(), sum_of_exp); - auto log_softmax = builder.create( - shifted_logits.getLoc(), shifted_logits, log_of_sum.getResult()); + mlir::TF::LogOp::create(builder, shifted_logits.getLoc(), sum_of_exp); + auto log_softmax = mlir::TF::SubOp::create( + builder, shifted_logits.getLoc(), shifted_logits, log_of_sum.getResult()); return log_softmax.getResult(); } @@ -223,12 +224,11 @@ StatusOr GetFPConstOfType(mlir::OpBuilder& builder, const mlir::Value& input, float value) { if (mlir::TensorType type = mlir::dyn_cast(input.getType())) { - return builder - .create( - input.getLoc(), - mlir::DenseFPElementsAttr::get( - mlir::RankedTensorType::get({}, type.getElementType()), - {value})) + return mlir::TF::ConstOp::create( + builder, input.getLoc(), + mlir::DenseFPElementsAttr::get( + mlir::RankedTensorType::get({}, type.getElementType()), + {value})) .getOutput(); } else { return errors::Unimplemented("non tensor type for labels is not supported"); @@ -290,23 +290,23 @@ StatusOr ComputeOneHot(mlir::OpBuilder& builder, // Slice out the [1,1] for mesh_dim_index. 
mlir::Value shard_id = - builder - .create( - loc, mlir::RankedTensorType::get({1, 1}, builder.getI32Type()), - mesh_coordinates, - IntConst(builder, input.getLoc(), {0, mesh_dim_index}), - IntConst(builder, input.getLoc(), {1, 1})) + mlir::TF::SliceOp::create( + builder, loc, + mlir::RankedTensorType::get({1, 1}, builder.getI32Type()), + mesh_coordinates, + IntConst(builder, input.getLoc(), {0, mesh_dim_index}), + IntConst(builder, input.getLoc(), {1, 1})) .getOutput(); - shard_id = builder - .create( - loc, mlir::RankedTensorType::get({}, builder.getI32Type()), - shard_id, builder.getI64ArrayAttr({0, 1})) - .getOutput(); + shard_id = + mlir::TF::SqueezeOp::create( + builder, loc, mlir::RankedTensorType::get({}, builder.getI32Type()), + shard_id, builder.getI64ArrayAttr({0, 1})) + .getOutput(); // `new_indices` = `input` - `shard_id` * (classes/num_shards) mlir::Value id_offset = - builder.create(loc, shard_id, depth).getZ(); + mlir::TF::MulOp::create(builder, loc, shard_id, depth).getZ(); // Note that the type of id_offset (int32) may not match the type of input. // So we insert a cast in this case. @@ -314,25 +314,23 @@ StatusOr ComputeOneHot(mlir::OpBuilder& builder, mlir::dyn_cast(input.getType()); if (!input_type) return errors::InvalidArgument("input is not a TensorType"); if (!input_type.getElementType().isInteger(32)) - id_offset = - builder - .create( - loc, - mlir::RankedTensorType::get({}, input_type.getElementType()), - id_offset) - .getY(); + id_offset = mlir::TF::CastOp::create(builder, loc, + mlir::RankedTensorType::get( + {}, input_type.getElementType()), + id_offset) + .getY(); mlir::Value indices = - builder.create(loc, input, id_offset).getZ(); + mlir::TF::SubOp::create(builder, loc, input, id_offset).getZ(); TF_ASSIGN_OR_RETURN(mlir::Value on_value, GetFPConstOfType(builder, features, 1.0)); TF_ASSIGN_OR_RETURN(mlir::Value off_value, GetFPConstOfType(builder, features, 0.0)); - return builder - .create(input.getLoc(), indices, depth, on_value, - off_value, builder.getI64IntegerAttr(1)) + return mlir::TF::OneHotOp::create(builder, input.getLoc(), indices, depth, + on_value, off_value, + builder.getI64IntegerAttr(1)) .getOutput(); } @@ -530,7 +528,7 @@ StatusOr SoftmaxLossOpSPMDExpander::MaybeRelayoutOutputs( llvm::SmallVector values = {new_loss, new_backprop}; mlir::TF::IdentityNOp identity_op = - builder.create(loss.getLoc(), types, values); + mlir::TF::IdentityNOp::create(builder, loss.getLoc(), types, values); newly_created_ops.insert(identity_op); @@ -627,17 +625,15 @@ StatusOr SoftmaxLossOpSPMDExpander::ExpandOp( GetFPConstOfType(builder, labels, 0.0)); const mlir::Value is_labels_zero = - builder - .create(op->getLoc(), labels, labels_zero, - builder.getBoolAttr(true)) + mlir::TF::EqualOp::create(builder, op->getLoc(), labels, labels_zero, + builder.getBoolAttr(true)) .getZ(); const mlir::Value safe_softmax = - builder - .create(op->getLoc(), is_labels_zero, - features_zero, log_softmax) + mlir::TF::SelectV2Op::create(builder, op->getLoc(), is_labels_zero, + features_zero, log_softmax) .getOutput(); const mlir::Value prod = - builder.create(op->getLoc(), labels, safe_softmax) + mlir::TF::MulOp::create(builder, op->getLoc(), labels, safe_softmax) .getZ(); // Compute the reduce sum @@ -648,10 +644,10 @@ StatusOr SoftmaxLossOpSPMDExpander::ExpandOp( builder.setInsertionPointAfterValue(positive_loss); mlir::Value loss = - builder.create(op->getLoc(), positive_loss).getY(); + mlir::TF::NegOp::create(builder, op->getLoc(), positive_loss).getY(); mlir::Value backprop = - 
builder.create(op->getLoc(), softmax, labels); + mlir::TF::SubOp::create(builder, op->getLoc(), softmax, labels); return MaybeRelayoutOutputs(op, loss, backprop, internal_layout, output_layouts[0], output_layouts[1]); diff --git a/tensorflow/dtensor/mlir/expansions/tensorlist_reserve_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/tensorlist_reserve_spmd_expander.cc index 7dc4ae56d0ed71..8e2fa02dcc9f44 100644 --- a/tensorflow/dtensor/mlir/expansions/tensorlist_reserve_spmd_expander.cc +++ b/tensorflow/dtensor/mlir/expansions/tensorlist_reserve_spmd_expander.cc @@ -58,10 +58,9 @@ StatusOr TensorListReserveSPMDExpander::ExpandOp( mlir::RankedTensorType::get(local_shape, element_type), builder.getContext())); mlir::Value new_shape_value = Int64Const(builder, DT_LOC(op), local_shape); - mlir::TF::TensorListReserveOp new_op = - builder.create( - DT_LOC(op), new_output_type, new_shape_value, - tensorlist_op.getNumElements()); + mlir::TF::TensorListReserveOp new_op = mlir::TF::TensorListReserveOp::create( + builder, DT_LOC(op), new_output_type, new_shape_value, + tensorlist_op.getNumElements()); op->getResult(0).replaceAllUsesWith(new_op.getResult()); op->erase(); diff --git a/tensorflow/dtensor/mlir/handle_cross_cluster_dependencies.cc b/tensorflow/dtensor/mlir/handle_cross_cluster_dependencies.cc index a0c137cb83dc4d..f1e3a60f8a2d21 100644 --- a/tensorflow/dtensor/mlir/handle_cross_cluster_dependencies.cc +++ b/tensorflow/dtensor/mlir/handle_cross_cluster_dependencies.cc @@ -206,8 +206,8 @@ mlir::LogicalResult HandleCopyToMeshWithinCluster( } } mlir::OpBuilder builder(op); - auto identity_op = builder.create( - op.getLoc(), input.getType(), input); + auto identity_op = mlir::TF::IdentityOp::create(builder, op.getLoc(), + input.getType(), input); op->getResult(0).replaceAllUsesWith(identity_op.getOutput()); op->erase(); return mlir::WalkResult::advance(); @@ -246,8 +246,9 @@ mlir::LogicalResult LowerToSendRecv(mlir::TF::CopyToMeshOp copy_to_mesh, // Create send op that sends data from input cluster to target cluster. const Mesh& target_mesh = mesh_or_status.value(); - builder.create( - copy_to_mesh.getLoc(), value_to_send, builder.getStringAttr(op_key), + mlir::TF::DTensorSend::create( + builder, copy_to_mesh.getLoc(), value_to_send, + builder.getStringAttr(op_key), mlir::dtensor::MeshAttr::get(context, target_mesh)); // Create recv op that recvs data from send op. 
@@ -258,8 +259,8 @@ mlir::LogicalResult LowerToSendRecv(mlir::TF::CopyToMeshOp copy_to_mesh, "CopyToMesh op must have static shape."); builder.setInsertionPoint(copy_to_mesh); - auto recv_op = builder.create( - copy_to_mesh.getLoc(), value_to_send.getType(), + auto recv_op = mlir::TF::DTensorRecv::create( + builder, copy_to_mesh.getLoc(), value_to_send.getType(), builder.getStringAttr(op_key), mlir::TF::ShapeAttr::get(context, tensor_type), mlir::dtensor::MeshAttr::get(context, target_mesh)); @@ -396,8 +397,9 @@ mlir::LogicalResult InsertCopyToMesh(mlir::tf_device::ClusterOp cluster) { if (input_mesh == mesh) continue; mlir::OpBuilder builder(op); - auto new_op = builder.create( - op->getLoc(), op->getResult(0).getType(), input, mesh.ToString()); + auto new_op = mlir::TF::CopyToMeshOp::create(builder, op->getLoc(), + op->getResult(0).getType(), + input, mesh.ToString()); op->replaceUsesOfWith(input, new_op.getResult()); } return mlir::success(); diff --git a/tensorflow/dtensor/mlir/layout_propagation_v2.cc b/tensorflow/dtensor/mlir/layout_propagation_v2.cc index c8dd29135e96ca..49ede9025b4310 100644 --- a/tensorflow/dtensor/mlir/layout_propagation_v2.cc +++ b/tensorflow/dtensor/mlir/layout_propagation_v2.cc @@ -733,9 +733,9 @@ mlir::LogicalResult InsertDTensorLayoutOps( mlir::Type value_type = GetSubtypeOrSelf(merged_layout.first); if (auto type = mlir::dyn_cast(value_type)) { - auto layout_op = builder.create( - merged_layout.first.getLoc(), merged_layout.first, layout_attr, - mlir::TF::ShapeAttr::get(builder.getContext(), type)); + auto layout_op = mlir::TF::DTensorLayout::create( + builder, merged_layout.first.getLoc(), merged_layout.first, + layout_attr, mlir::TF::ShapeAttr::get(builder.getContext(), type)); llvm::SmallPtrSet exception{layout_op}; merged_layout.first.replaceAllUsesExcept(layout_op.getOutput(), exception); @@ -1234,30 +1234,26 @@ mlir::LogicalResult InsertRelayoutForWhileLoops( mlir::TF::ShapeAttr global_shape = mlir::TF::ShapeAttr::get( builder.getContext(), mlir::cast(yield_op->getOperand(i).getType())); - mlir::TF::RelayoutOp first_relayout = - builder.create( - op.getLoc(), yield_op->getOperand(i).getType(), - yield_op->getOperand(i), input_layout.ToString()); - mlir::TF::DTensorLayout first_layout_op = - builder.create( - op.getLoc(), first_relayout.getOutput(), - mlir::dtensor::LayoutAttr::get(builder.getContext(), - input_layout), - global_shape); + mlir::TF::RelayoutOp first_relayout = mlir::TF::RelayoutOp::create( + builder, op.getLoc(), yield_op->getOperand(i).getType(), + yield_op->getOperand(i), input_layout.ToString()); + mlir::TF::DTensorLayout first_layout_op = mlir::TF::DTensorLayout::create( + builder, op.getLoc(), first_relayout.getOutput(), + mlir::dtensor::LayoutAttr::get(builder.getContext(), input_layout), + global_shape); yield_op->setOperand(i, first_layout_op.getOutput()); // Insert the second relayout op after the loop itself. 
builder.setInsertionPointAfter(op); mlir::TF::DTensorLayout second_layout_op = - builder.create( - op.getLoc(), op->getResult(i), + mlir::TF::DTensorLayout::create( + builder, op.getLoc(), op->getResult(i), mlir::dtensor::LayoutAttr::get(builder.getContext(), input_layout), global_shape); - mlir::TF::RelayoutOp second_relayout = - builder.create( - op.getLoc(), second_layout_op.getOutput().getType(), - second_layout_op.getOutput(), output_layout.ToString()); + mlir::TF::RelayoutOp second_relayout = mlir::TF::RelayoutOp::create( + builder, op.getLoc(), second_layout_op.getOutput().getType(), + second_layout_op.getOutput(), output_layout.ToString()); op->getResult(i).replaceAllUsesExcept( second_relayout.getOutput(), llvm::SmallPtrSet{ second_layout_op.getOperation()}); diff --git a/tensorflow/dtensor/mlir/lower_send_recv.cc b/tensorflow/dtensor/mlir/lower_send_recv.cc index 0cbcdd61abd7c4..142932afbee7da 100644 --- a/tensorflow/dtensor/mlir/lower_send_recv.cc +++ b/tensorflow/dtensor/mlir/lower_send_recv.cc @@ -90,8 +90,8 @@ void PropagateDeviceIdToClusters(mlir::ModuleOp module) { module.walk([&](mlir::tf_device::ClusterOp op) { mlir::OpBuilder builder(&op.GetBody().front()); - builder.create(main_func.getLoc(), - device_id->getType(), *device_id); + mlir::TF::IdentityOp::create(builder, main_func.getLoc(), + device_id->getType(), *device_id); }); } diff --git a/tensorflow/dtensor/mlir/merge_clusters.cc b/tensorflow/dtensor/mlir/merge_clusters.cc index 0e88ca55057a26..81a856aa1a0c9c 100644 --- a/tensorflow/dtensor/mlir/merge_clusters.cc +++ b/tensorflow/dtensor/mlir/merge_clusters.cc @@ -288,31 +288,31 @@ void CloneEmptyIfWithPredicate(mlir::TF::IfRegionOp if_region, const Mesh& mesh, absl::StrCat(kSendRecvKeyPrefix, *num_send_recvs); *num_send_recvs += 1; - builder.create( - if_region.getLoc(), if_region.getCond(), - builder.getStringAttr(send_recv_key), - mlir::dtensor::MeshAttr::get(context, mesh)); + mlir::TF::DTensorSend::create(builder, if_region.getLoc(), + if_region.getCond(), + builder.getStringAttr(send_recv_key), + mlir::dtensor::MeshAttr::get(context, mesh)); // Create new cluster op that contains cloned if operation. - auto new_cluster = builder.create( - if_region.getLoc(), llvm::SmallVector{}); + auto new_cluster = mlir::tf_device::ClusterOp::create( + builder, if_region.getLoc(), llvm::SmallVector{}); new_cluster.getBody().push_back(new mlir::Block); builder.setInsertionPointToEnd(&new_cluster.GetBody()); - auto return_op = builder.create( - if_region.getLoc(), llvm::SmallVector{}); + auto return_op = mlir::tf_device::ReturnOp::create( + builder, if_region.getLoc(), llvm::SmallVector{}); // Add DTensorRecv op inside new cluster that receives the cluster. builder.setInsertionPoint(return_op); - auto recv_op = builder.create( - if_region.getLoc(), predicate_tensor_type, + auto recv_op = mlir::TF::DTensorRecv::create( + builder, if_region.getLoc(), predicate_tensor_type, builder.getStringAttr(send_recv_key), mlir::TF::ShapeAttr::get(context, predicate_tensor_type), mlir::dtensor::MeshAttr::get(context, mesh)); // Clone tf.IfRegion op inside newly created cluster and make sure // that the predicate tensor is from DTensorRecv op created above. 
- auto host_side_if = builder.create( - if_region.getLoc(), llvm::SmallVector{}, + auto host_side_if = mlir::TF::IfRegionOp::create( + builder, if_region.getLoc(), llvm::SmallVector{}, recv_op.getOutput(), if_region.getIsStateless(), GetUniqueControlflowFnName("cloned_if_then", builder), GetUniqueControlflowFnName("cloned_if_else", builder)); @@ -322,15 +322,15 @@ void CloneEmptyIfWithPredicate(mlir::TF::IfRegionOp if_region, const Mesh& mesh, auto& then_branch = host_side_if.getThenBranch(); then_branch.push_back(new mlir::Block); builder.setInsertionPointToEnd(&then_branch.front()); - builder.create(if_region.getLoc(), - /*operands=*/llvm::ArrayRef{}); + mlir::TF::YieldOp::create(builder, if_region.getLoc(), + /*operands=*/llvm::ArrayRef{}); // Create empty else branch region. auto& else_branch = host_side_if.getElseBranch(); else_branch.push_back(new mlir::Block); builder.setInsertionPointToEnd(&else_branch.front()); - builder.create(if_region.getLoc(), - /*operands=*/llvm::ArrayRef{}); + mlir::TF::YieldOp::create(builder, if_region.getLoc(), + /*operands=*/llvm::ArrayRef{}); new_cluster->setAttr(kMeshAttr, builder.getStringAttr(mesh.ToString())); } @@ -550,8 +550,8 @@ mlir::LogicalResult MergeClusters(mlir::ModuleOp module) { // Create a single cluster op contains merged computations for `mesh`. builder.setInsertionPoint(&func_block.front()); - auto new_cluster = builder.create( - module.getLoc(), merged_return_types); + auto new_cluster = mlir::tf_device::ClusterOp::create( + builder, module.getLoc(), merged_return_types); new_cluster.getBody().push_back(new mlir::Block); new_cluster->setAttr(kMeshAttr, builder.getStringAttr(mesh.ToString())); @@ -578,8 +578,8 @@ mlir::LogicalResult MergeClusters(mlir::ModuleOp module) { } builder.setInsertionPointToEnd(&new_cluster.GetBody()); - builder.create(new_cluster.getLoc(), - merged_return_values); + mlir::tf_device::ReturnOp::create(builder, new_cluster.getLoc(), + merged_return_values); // Replace return value usages. 
for (auto it : diff --git a/tensorflow/dtensor/mlir/move_compilation_to_host.cc b/tensorflow/dtensor/mlir/move_compilation_to_host.cc index 053913f4844606..894b1bacbe72ee 100644 --- a/tensorflow/dtensor/mlir/move_compilation_to_host.cc +++ b/tensorflow/dtensor/mlir/move_compilation_to_host.cc @@ -117,8 +117,8 @@ mlir::LogicalResult CreateSendRecvOpsToTransferProgramKey( builder.setInsertionPointAfter(compile_op); for (int i = 0; i < num_tpu_devices; ++i) { const std::string& tensor_name = device_key_map[i]; - auto send = builder.create( - compile_op->getLoc(), compilation_key, tensor_name, + auto send = mlir::TF::_HostSendOp::create( + builder, compile_op->getLoc(), compilation_key, tensor_name, compile_op_launch.getDevice(), /*send_device_incarnation=*/0, local_devices[i]); send->setAttr("device", compile_op_launch.getDeviceAttr()); @@ -148,15 +148,15 @@ mlir::LogicalResult CreateSendRecvOpsToTransferProgramKey( mlir::Block* fn_block = recv_select_fn.addEntryBlock(); mlir::OpBuilder fn_builder = mlir::OpBuilder::atBlockEnd(fn_block); - auto recv = fn_builder.create( - compile_op->getLoc(), + auto recv = mlir::TF::_HostRecvOp::create( + fn_builder, compile_op->getLoc(), mlir::cast(compilation_key.getType()), device_key_map[i], compile_op_launch.getDevice(), /*send_device_incarnation=*/0, local_devices[i]); recv->setAttr("device", builder.getStringAttr(local_devices[i])); - fn_builder.create(recv_select_fn.getLoc(), - recv.getTensor()); + mlir::func::ReturnOp::create(fn_builder, recv_select_fn.getLoc(), + recv.getTensor()); compilation_key_functions.emplace_back(recv_select_fn); } @@ -172,8 +172,8 @@ mlir::LogicalResult CreateSendRecvOpsToTransferProgramKey( symbols.push_back(mlir::SymbolRefAttr::get(func)); // Create a TF::Case op that selects `values` based on `id`. - auto program_key = builder.create( - compile_op.getLoc(), + auto program_key = mlir::TF::CaseOp::create( + builder, compile_op.getLoc(), /*output=*/llvm::SmallVector{compilation_key.getType()}, /*branch_index=*/*device_id, /*input=*/llvm::ArrayRef{}, @@ -288,15 +288,16 @@ mlir::LogicalResult HandleCompilationOps( llvm::formatv("error while creating TPU compilation logic. {0}", device_ordinal_host.status().message())); - mlir::Value predicate_host = builder.create( - compile_op.getLoc(), *device_ordinal_host, + mlir::Value predicate_host = mlir::TF::EqualOp::create( + builder, compile_op.getLoc(), *device_ordinal_host, CreateIntScalarConst(0, builder, compile_op.getLoc()), /*incompatible_shape_error=*/builder.getBoolAttr(true)); // If op here contains send/recv and TPUCompile op that should not be pruned // away. Therefore, we explicitly set the op to be stateful. - auto if_host = builder.create( - compile_op.getLoc(), llvm::SmallVector{}, predicate_host, + auto if_host = mlir::TF::IfRegionOp::create( + builder, compile_op.getLoc(), llvm::SmallVector{}, + predicate_host, /*is_stateless=*/builder.getBoolAttr(false), GetUniqueControlflowFnName("compilation_host_then", builder), GetUniqueControlflowFnName("compilation_host_else", builder)); @@ -305,18 +306,17 @@ mlir::LogicalResult HandleCompilationOps( auto& host_else_branch = if_host.getElseBranch(); host_else_branch.push_back(new mlir::Block); builder.setInsertionPointToEnd(&host_else_branch.front()); - builder.create( - compile_op.getLoc(), - /*operands=*/llvm::ArrayRef{}); + mlir::TF::YieldOp::create(builder, compile_op.getLoc(), + /*operands=*/llvm::ArrayRef{}); // Create then branch region with logic to compile TPU program and send // program key to all TPU devices. 
auto& host_then_branch = if_host.getThenBranch(); host_then_branch.push_back(new mlir::Block); builder.setInsertionPointToEnd(&host_then_branch.front()); - auto yield = builder.create( - compile_op.getLoc(), - /*operands=*/llvm::ArrayRef{}); + auto yield = + mlir::TF::YieldOp::create(builder, compile_op.getLoc(), + /*operands=*/llvm::ArrayRef{}); compilation_move_before = yield; builder.setInsertionPointAfter(if_host); diff --git a/tensorflow/dtensor/mlir/op_to_device_cluster.cc b/tensorflow/dtensor/mlir/op_to_device_cluster.cc index df3aa89dd9bd35..89c351b0f71ccc 100644 --- a/tensorflow/dtensor/mlir/op_to_device_cluster.cc +++ b/tensorflow/dtensor/mlir/op_to_device_cluster.cc @@ -51,8 +51,8 @@ mlir::LogicalResult WrapDeviceCluster(mlir::OpBuilder *builder, mlir::Operation *op) { // Create new tf_device.cluster op wrapping a single operation. builder->setInsertionPoint(op); - auto cluster = builder->create( - op->getLoc(), op->getResultTypes()); + auto cluster = mlir::tf_device::ClusterOp::create(*builder, op->getLoc(), + op->getResultTypes()); if (auto layout_op = llvm::dyn_cast(op)) { cluster->setAttr(kMeshAttr, builder->getStringAttr( layout_op.getLayout().mesh().ToString())); @@ -89,7 +89,7 @@ mlir::LogicalResult WrapDeviceCluster(mlir::OpBuilder *builder, cluster.getBody().push_back(new mlir::Block); builder->setInsertionPointToEnd(&cluster.GetBody()); - builder->create(op->getLoc(), op->getResults()); + mlir::tf_device::ReturnOp::create(*builder, op->getLoc(), op->getResults()); // Move `op` inside newly created `ClusterOp`. op->moveBefore(cluster.GetBody().getTerminator()); diff --git a/tensorflow/dtensor/mlir/op_utils.cc b/tensorflow/dtensor/mlir/op_utils.cc index 08aa8f95612104..4b7a776ea2cd2c 100644 --- a/tensorflow/dtensor/mlir/op_utils.cc +++ b/tensorflow/dtensor/mlir/op_utils.cc @@ -116,8 +116,8 @@ mlir::LogicalResult ReplaceAuxiliaryDTensorLayoutOpsWithIdentity( // Replace DTensorLayout op with identity op. 
mlir::OpBuilder builder(input_layout_op); - auto new_identity = builder.create( - input_layout_op->getLoc(), input_layout_op.getType(), + auto new_identity = mlir::TF::IdentityOp::create( + builder, input_layout_op->getLoc(), input_layout_op.getType(), input_layout_op.getInput()); input_layout_op.getOutput().replaceAllUsesWith(new_identity.getOutput()); input_layout_op.erase(); diff --git a/tensorflow/dtensor/mlir/propagate_default_layout.cc b/tensorflow/dtensor/mlir/propagate_default_layout.cc index 6b0b35283fdca5..7be77a3f624ff4 100644 --- a/tensorflow/dtensor/mlir/propagate_default_layout.cc +++ b/tensorflow/dtensor/mlir/propagate_default_layout.cc @@ -53,8 +53,8 @@ void CreateDTensorLayoutOp(const Layout& layout, mlir::Value input, mlir::MLIRContext* context) { if (layout.IsEmpty()) return; - auto layout_op = builder->create( - loc, input, mlir::dtensor::LayoutAttr::get(context, layout), + auto layout_op = mlir::TF::DTensorLayout::create( + *builder, loc, input, mlir::dtensor::LayoutAttr::get(context, layout), mlir::TF::ShapeAttr::get(context, type)); if (arg_index != nullptr) { layout_op->setAttr(kFromArgIndex, arg_index); diff --git a/tensorflow/dtensor/mlir/propagate_device_id_to_function_args.cc b/tensorflow/dtensor/mlir/propagate_device_id_to_function_args.cc index 0417e392b4b28d..7381e3628e25d9 100644 --- a/tensorflow/dtensor/mlir/propagate_device_id_to_function_args.cc +++ b/tensorflow/dtensor/mlir/propagate_device_id_to_function_args.cc @@ -125,8 +125,8 @@ mlir::LogicalResult PrependDeviceIdToCallsites(mlir::OpBuilder* builder, mlir::Operation* new_call = nullptr; if (auto stateful_partitioned_call = llvm::dyn_cast(op)) { - new_call = builder->create( - op->getLoc(), op->getResultTypes(), new_operands, + new_call = mlir::TF::StatefulPartitionedCallOp::create( + *builder, op->getLoc(), op->getResultTypes(), new_operands, /*args_attrs=*/nullptr, /*res_attrs=*/nullptr, stateful_partitioned_call.getF(), stateful_partitioned_call.getConfig(), @@ -134,8 +134,8 @@ mlir::LogicalResult PrependDeviceIdToCallsites(mlir::OpBuilder* builder, stateful_partitioned_call.getExecutorType()); } else { auto partitioned_call = llvm::cast(op); - new_call = builder->create( - op->getLoc(), op->getResultTypes(), new_operands, + new_call = mlir::TF::PartitionedCallOp::create( + *builder, op->getLoc(), op->getResultTypes(), new_operands, /*args_attrs=*/nullptr, /*res_attrs=*/nullptr, partitioned_call.getF(), partitioned_call.getConfig(), partitioned_call.getConfigProto(), diff --git a/tensorflow/dtensor/mlir/restore_shape_inference.cc b/tensorflow/dtensor/mlir/restore_shape_inference.cc index ab327153634786..3be8637314be97 100644 --- a/tensorflow/dtensor/mlir/restore_shape_inference.cc +++ b/tensorflow/dtensor/mlir/restore_shape_inference.cc @@ -85,8 +85,8 @@ mlir::LogicalResult BackwardShapeInferenceToRestoreOp(mlir::ModuleOp module, // O(N). 
value.setType(type); } else if (auto cast_op = llvm::dyn_cast_or_null(op)) { - auto new_cast_op = builder->create(cast_op.getLoc(), type, - cast_op.getOperand()); + auto new_cast_op = mlir::TF::CastOp::create(*builder, cast_op.getLoc(), + type, cast_op.getOperand()); cast_op.replaceAllUsesWith(new_cast_op.getResult()); cast_op.erase(); @@ -103,8 +103,8 @@ mlir::LogicalResult BackwardShapeInferenceToRestoreOp(mlir::ModuleOp module, module, builder, new_cast_op.getOperand(), new_type); } else if (auto identity_op = llvm::dyn_cast_or_null(op)) { - auto new_identity_op = builder->create( - identity_op.getLoc(), type, identity_op.getInput()); + auto new_identity_op = mlir::TF::IdentityOp::create( + *builder, identity_op.getLoc(), type, identity_op.getInput()); identity_op.getOutput().replaceAllUsesWith(new_identity_op.getOutput()); identity_op.erase(); @@ -128,8 +128,9 @@ mlir::LogicalResult BackwardShapeInferenceToRestoreOp(mlir::ModuleOp module, // RestoreV2Op we want to fix is on the mesh of the corresponding // DTensorSend. Set shape of this DTensorRecv first and go to the // corresponding DTensorSend. - auto new_recv_op = builder->create( - recv_op.getLoc(), type, builder->getStringAttr(recv_op.getKey()), + auto new_recv_op = mlir::TF::DTensorRecv::create( + *builder, recv_op.getLoc(), type, + builder->getStringAttr(recv_op.getKey()), mlir::TF::ShapeAttr::get(builder->getContext(), mlir::dyn_cast(type)), mlir::dtensor::MeshAttr::get(builder->getContext(), recv_op.getMesh())); diff --git a/tensorflow/dtensor/mlir/sparse_expansions/dynamic_enqueue_sparse_expander.cc b/tensorflow/dtensor/mlir/sparse_expansions/dynamic_enqueue_sparse_expander.cc index f08908eff9395e..e695320769ecc4 100644 --- a/tensorflow/dtensor/mlir/sparse_expansions/dynamic_enqueue_sparse_expander.cc +++ b/tensorflow/dtensor/mlir/sparse_expansions/dynamic_enqueue_sparse_expander.cc @@ -53,14 +53,14 @@ StatusOr ExpandIndices(mlir::OpBuilder& builder, .getElementType()); // Little trick to make a rank-2 tensor of [[0,0], [0,1]] using rank 1 // constants. - mlir::Value indices_padding = builder.create( - loc, + mlir::Value indices_padding = mlir::TF::ReshapeOp::create( + builder, loc, mlir::TF::collection_ops_util::GetR1Const({0, 0, 0, 1}, builder, loc), mlir::TF::collection_ops_util::GetR1Const({2, 2}, builder, loc)); mlir::Value indices_padded = - builder.create(loc, indices_padded_type, - /*input=*/indices, - /*paddings=*/indices_padding); + mlir::TF::PadOp::create(builder, loc, indices_padded_type, + /*input=*/indices, + /*paddings=*/indices_padding); return indices_padded; } @@ -98,16 +98,15 @@ StatusOr DynamicEnqueueSparseExpander::ExpandOp( // This op does not have a return value so we do not need to replace any // consumers. 
mlir::Operation* sparse_enqueue_op = - builder - .create( - location, - /*sample_indices_or_row_splits_list=*/indices, - /*embedding_indices=*/values, - /*aggregation_weights=*/dense_enqueue_op.getAggregationWeights(), - /*mode_override=*/ - dense_enqueue_op.getModeOverride(), - /*device_ordinal=*/dense_enqueue_op.getDeviceOrdinal(), - /*combiners=*/dense_enqueue_op.getCombiners()); + mlir::TF::DynamicEnqueueTPUEmbeddingArbitraryTensorBatchOp::create( + builder, location, + /*sample_indices_or_row_splits_list=*/indices, + /*embedding_indices=*/values, + /*aggregation_weights=*/dense_enqueue_op.getAggregationWeights(), + /*mode_override=*/ + dense_enqueue_op.getModeOverride(), + /*device_ordinal=*/dense_enqueue_op.getDeviceOrdinal(), + /*combiners=*/dense_enqueue_op.getCombiners()); dense_enqueue_op.erase(); return sparse_enqueue_op; } diff --git a/tensorflow/dtensor/mlir/sparse_expansions/matmul_sparse_expander.cc b/tensorflow/dtensor/mlir/sparse_expansions/matmul_sparse_expander.cc index 7ed10e42dfe186..5056b89ca9ae32 100644 --- a/tensorflow/dtensor/mlir/sparse_expansions/matmul_sparse_expander.cc +++ b/tensorflow/dtensor/mlir/sparse_expansions/matmul_sparse_expander.cc @@ -38,8 +38,8 @@ StatusOr MatMulSparseExpander::ExpandOp(mlir::Operation* op) { // Since operand 0 is a SparseValue, we don't need to check that // the indices, values, and dense_shapes exist. mlir::TF::SparseTensorDenseMatMulOp new_op = - builder.create( - op->getLoc(), op->getResultTypes(), + mlir::TF::SparseTensorDenseMatMulOp::create( + builder, op->getLoc(), op->getResultTypes(), mlir::ValueRange{ GetIndicesFromSparseTensor(op->getOperand(0)).value(), GetValuesFromSparseTensor(op->getOperand(0)).value(), diff --git a/tensorflow/dtensor/mlir/spmd_expander_common.cc b/tensorflow/dtensor/mlir/spmd_expander_common.cc index 91eab6f8438dc2..9fd3af1af33c07 100644 --- a/tensorflow/dtensor/mlir/spmd_expander_common.cc +++ b/tensorflow/dtensor/mlir/spmd_expander_common.cc @@ -125,8 +125,8 @@ absl::Status CreateSplitOp(const int num_split, const int split_dimension, mlir::RankedTensorType::get({}, builder->getIntegerType(32)); auto split_dimension_attr = mlir::DenseElementsAttr::get(split_dim_type, split_dimension); - auto split_dimension_op = builder->create( - location, split_dim_type, split_dimension_attr); + auto split_dimension_op = mlir::TF::ConstOp::create( + *builder, location, split_dim_type, split_dimension_attr); // Correctly set output shapes of split op output if input shape is statically // known. @@ -157,8 +157,9 @@ absl::Status CreateSplitOp(const int num_split, const int split_dimension, // Creates a split op that splits |src_input| along |split_dimension|. 
llvm::SmallVector output_types(num_split, output_type); - *split_op = builder->create( - location, output_types, split_dimension_op.getOutput(), src_input); + *split_op = + mlir::TF::SplitOp::create(*builder, location, output_types, + split_dimension_op.getOutput(), src_input); return absl::OkStatus(); } diff --git a/tensorflow/dtensor/mlir/tpu_integration.cc b/tensorflow/dtensor/mlir/tpu_integration.cc index 67c6e0d9fbed23..e7fffe44a1f520 100644 --- a/tensorflow/dtensor/mlir/tpu_integration.cc +++ b/tensorflow/dtensor/mlir/tpu_integration.cc @@ -110,8 +110,8 @@ mlir::LogicalResult CreateTPUCluster( auto& function_block = function->getCallableRegion()->front(); builder->setInsertionPointToStart(&function_block); - auto cluster = builder->create( - tpu_call.getLoc(), function->getResultTypes()); + auto cluster = mlir::tf_device::ClusterOp::create(*builder, tpu_call.getLoc(), + function->getResultTypes()); cluster.getBody().push_back(new mlir::Block); auto& function_body = function_block.getOperations(); @@ -121,8 +121,8 @@ mlir::LogicalResult CreateTPUCluster( builder->setInsertionPointToEnd(&cluster.GetBody()); mlir::Operation* function_block_terminator = function_block.getTerminator(); - builder->create( - tpu_call.getLoc(), function_block_terminator->getOperands()); + mlir::tf_device::ReturnOp::create(*builder, tpu_call.getLoc(), + function_block_terminator->getOperands()); function_block_terminator->setOperands(cluster.getResults()); diff --git a/tensorflow/dtensor/mlir/utils/collective_lowering.cc b/tensorflow/dtensor/mlir/utils/collective_lowering.cc index df52a5ddde934b..7858b3430d33ef 100644 --- a/tensorflow/dtensor/mlir/utils/collective_lowering.cc +++ b/tensorflow/dtensor/mlir/utils/collective_lowering.cc @@ -122,8 +122,8 @@ mlir::LogicalResult EmitAllReduceForXla( constexpr char kCrossReplica[] = "CrossReplica"; // For TPUs, lower to XlaAllReduce straightforwardly. - *final_op = builder.create( - all_reduce.getLoc(), all_reduce.getResult().getType(), + *final_op = mlir::TF::XlaAllReduceOp::create( + builder, all_reduce.getLoc(), all_reduce.getResult().getType(), all_reduce.getInput(), all_reduce.getGroupAssignment(), all_reduce.getReduceOpAttr(), builder.getStringAttr(kCrossReplica)); return mlir::success(); @@ -198,7 +198,7 @@ mlir::Value GetRelativeDeviceId(mlir::Operation* op, ops_util::ReshapeScalarToSizeType(builder, DeviceId(op).value(), loc); mlir::Value start_device_id = ops_util::GetR1Const( {output_layout.mesh().min_global_device_id()}, builder, loc); - return builder.create(loc, device_id, start_device_id); + return mlir::TF::SubOp::create(builder, loc, device_id, start_device_id); } void CreateGroupAndInstanceKey( @@ -219,13 +219,14 @@ void CreateGroupAndInstanceKey( // Create a scalar group key by slicing device_id_to_group_key with // device_id. 
auto group_key_loc = DT_LOC2(loc, "group_key"); - auto group_key_slice = builder.create( - group_key_loc, EffectivelyScalarR1Type(builder.getIntegerType(32)), + auto group_key_slice = mlir::TF::SliceOp::create( + builder, group_key_loc, + EffectivelyScalarR1Type(builder.getIntegerType(32)), /*input=*/IntConst(builder, loc, device_id_to_group_key), /*begin=*/device_id, /*size=*/IntConst(builder, loc, {1})); - auto group_key_reshape = builder.create( - group_key_loc, /*tensor=*/group_key_slice.getResult(), + auto group_key_reshape = mlir::TF::ReshapeOp::create( + builder, group_key_loc, /*tensor=*/group_key_slice.getResult(), /*shape=*/ops_util::GetR1Const({}, builder, loc)); *group_key_scalar = group_key_reshape.getResult(); @@ -257,8 +258,8 @@ mlir::Operation* EmitCollectiveReduce( const bool is_mean_op = reduce_op_str == kReduceOpMean; mlir::Value group_size_scalar = ops_util::CreateScalarConst( host_group_size, builder, DT_LOC2(loc, "group_size")); - auto collective_reduce = builder.create( - loc, /*output_type=*/input.getType(), input, group_size_scalar, + auto collective_reduce = mlir::TF::CollectiveReduceV2Op::create( + builder, loc, /*output_type=*/input.getType(), input, group_size_scalar, group_key_scalar, instance_key_scalar, /*ordering_token=*/mlir::ValueRange({}), /*merge_op=*/builder.getStringAttr(is_mean_op ? "Add" : reduce_op_str), @@ -312,19 +313,21 @@ mlir::Operation* EmitCollectiveReduceScatter( const bool is_mean_op = reduce_op_str == kReduceOpMean; mlir::Value group_size_scalar = ops_util::CreateScalarConst( host_group_size, builder, DT_LOC2(loc, "group_size")); - auto collective_reduce_scatter = builder.create< - mlir::TF::CollectiveReduceScatterV2Op>( - loc, output_type, input, group_size_scalar, group_key_scalar, - instance_key_scalar, - /*ordering_token=*/mlir::ValueRange({}), - /*merge_op=*/builder.getStringAttr(is_mean_op ? "Add" : reduce_op_str), - /*final_op=*/builder.getStringAttr(is_mean_op ? "Div" : "Id"), - /*communication_hint=*/builder.getStringAttr("nccl"), // TODO(tmorris): - // this shouldn't - // be needed - /*timeout_seconds=*/builder.getF32FloatAttr(0.), - /*is_stateless=*/builder.getBoolAttr(false), - /*max_subdivs_per_device=*/builder.getI64IntegerAttr(16)); + auto collective_reduce_scatter = + mlir::TF::CollectiveReduceScatterV2Op::create( + builder, loc, output_type, input, group_size_scalar, group_key_scalar, + instance_key_scalar, + /*ordering_token=*/mlir::ValueRange({}), + /*merge_op=*/ + builder.getStringAttr(is_mean_op ? "Add" : reduce_op_str), + /*final_op=*/builder.getStringAttr(is_mean_op ? 
"Div" : "Id"), + /*communication_hint=*/ + builder.getStringAttr("nccl"), // TODO(tmorris): + // this shouldn't + // be needed + /*timeout_seconds=*/builder.getF32FloatAttr(0.), + /*is_stateless=*/builder.getBoolAttr(false), + /*max_subdivs_per_device=*/builder.getI64IntegerAttr(16)); SetSingleLayoutOnOp(collective_reduce_scatter, Layout::Empty()); if (need_transpose) { return EmitTransposeOp(builder, loc, @@ -394,8 +397,8 @@ mlir::Operation* EmitCollectiveAllToAll( new_shape.push_back(input_shape[i]); } } - auto reshape_op = builder.create( - loc, data, ops_util::GetR1Const(new_shape, builder, loc)); + auto reshape_op = mlir::TF::ReshapeOp::create( + builder, loc, data, ops_util::GetR1Const(new_shape, builder, loc)); std::vector perm_for_permute_transpose; perm_for_permute_transpose.reserve(input_shape.size() + 1); @@ -420,8 +423,8 @@ mlir::Operation* EmitCollectiveAllToAll( 1LL, std::multiplies()); std::vector flatten_shape = {host_group_size, num_elements / host_group_size}; - auto flatten_reshape_op = builder.create( - loc, input, ops_util::GetR1Const(flatten_shape, builder, loc)); + auto flatten_reshape_op = mlir::TF::ReshapeOp::create( + builder, loc, input, ops_util::GetR1Const(flatten_shape, builder, loc)); mlir::TensorType output_type = mlir::RankedTensorType::get(flatten_shape, input_type.getElementType()); @@ -432,9 +435,10 @@ mlir::Operation* EmitCollectiveAllToAll( &group_key_scalar, &instance_key_scalar); mlir::Value group_size_scalar = ops_util::CreateScalarConst(host_group_size, builder, loc); - auto collective_alltoall = builder.create( - loc, /*output_type=*/output_type, flatten_reshape_op->getResult(0), - group_size_scalar, group_key_scalar, instance_key_scalar, + auto collective_alltoall = mlir::TF::CollectiveAllToAllV2Op::create( + builder, loc, /*output_type=*/output_type, + flatten_reshape_op->getResult(0), group_size_scalar, group_key_scalar, + instance_key_scalar, /*ordering_token=*/mlir::ValueRange({}), /*communication_hint=*/builder.getStringAttr(""), /*timeout_seconds=*/builder.getF32FloatAttr(0.), @@ -444,8 +448,9 @@ mlir::Operation* EmitCollectiveAllToAll( if (requires_transpose) { // Unflatten after all-to-all. - auto reshape_op = builder.create( - loc, prev_op, ops_util::GetR1Const(transposed_shape, builder, loc)); + auto reshape_op = mlir::TF::ReshapeOp::create( + builder, loc, prev_op, + ops_util::GetR1Const(transposed_shape, builder, loc)); // Undo earlier transpose which moved split or concat dim to rank 0. 
std::vector perm_for_transpose; perm_for_transpose.reserve(input_shape.size()); @@ -473,8 +478,8 @@ mlir::Operation* EmitCollectiveAllToAll( std::vector output_shape(input_shape.begin(), input_shape.end()); output_shape[concat_dimension] *= host_group_size; output_shape[split_dimension] /= host_group_size; - auto post_reshape_op = builder.create( - loc, prev_op, ops_util::GetR1Const(output_shape, builder, loc)); + auto post_reshape_op = mlir::TF::ReshapeOp::create( + builder, loc, prev_op, ops_util::GetR1Const(output_shape, builder, loc)); return post_reshape_op; } @@ -503,8 +508,8 @@ mlir::Operation* EmitCollectiveGather( mlir::Value group_size_scalar = ops_util::CreateScalarConst(host_group_size, builder, loc); - auto collective_gather = builder.create( - loc, /*output_type=*/input.getType(), input, group_size_scalar, + auto collective_gather = mlir::TF::CollectiveGatherV2Op::create( + builder, loc, /*output_type=*/input.getType(), input, group_size_scalar, group_key_scalar, instance_key_scalar, /*ordering_token=*/mlir::ValueRange({}), /*communication_hint=*/builder.getStringAttr(""), @@ -606,12 +611,10 @@ mlir::LogicalResult LowerReduceScatterOp( mlir::OpBuilder builder(reduce_scatter); if (reduce_scatter.getDeviceType().ends_with("TPU")) { // For TPUs, lower to XlaReduceScatter straightforwardly. - mlir::Operation* xla_reduce_scatter = - builder.create( - loc, reduce_scatter.getResult().getType(), - reduce_scatter.getInput(), reduce_scatter.getGroupAssignment(), - reduce_scatter.getScatterDimension(), - reduce_scatter.getReduceOpAttr()); + mlir::Operation* xla_reduce_scatter = mlir::TF::XlaReduceScatterOp::create( + builder, loc, reduce_scatter.getResult().getType(), + reduce_scatter.getInput(), reduce_scatter.getGroupAssignment(), + reduce_scatter.getScatterDimension(), reduce_scatter.getReduceOpAttr()); SetSingleLayoutOnOp(xla_reduce_scatter, *output_layout); reduce_scatter.replaceAllUsesWith(xla_reduce_scatter); } else if (reduce_scatter.getDeviceType().ends_with("GPU") && @@ -653,16 +656,17 @@ mlir::LogicalResult LowerReduceScatterOp( return reduce_scatter.emitOpError(input_layout.status().message()); } - auto dtensor_allreduce = builder.create( - reduce_scatter.getLoc(), reduce_scatter.getOperand(0).getType(), - reduce_scatter.getOperand(0), reduce_scatter.getGroupAssignment(), - reduce_scatter.getReduceOp(), reduce_scatter.getDeviceType()); + auto dtensor_allreduce = mlir::TF::DTensorAllReduceOp::create( + builder, reduce_scatter.getLoc(), + reduce_scatter.getOperand(0).getType(), reduce_scatter.getOperand(0), + reduce_scatter.getGroupAssignment(), reduce_scatter.getReduceOp(), + reduce_scatter.getDeviceType()); SetSingleLayoutOnOp(dtensor_allreduce, *input_layout); mlir::Operation* dtensor_all_scatter = - builder.create( - reduce_scatter.getLoc(), reduce_scatter.getResult().getType(), - dtensor_allreduce.getResult(), + mlir::TF::DTensorAllScatterOp::create( + builder, reduce_scatter.getLoc(), + reduce_scatter.getResult().getType(), dtensor_allreduce.getResult(), mlir::dtensor::LayoutAttr::get(builder.getContext(), *input_layout), mlir::dtensor::LayoutAttr::get(builder.getContext(), *output_layout)); @@ -676,8 +680,9 @@ mlir::LogicalResult LowerReduceScatterOp( mlir::Value CreateZeroScalar(mlir::OpBuilder& builder, mlir::Location loc, mlir::RankedTensorType type) { const mlir::Value zero_scalar = ops_util::CreateScalarConst(0, builder, loc); - return builder.create( - loc, mlir::RankedTensorType::get({}, type.getElementType()), zero_scalar); + return mlir::TF::CastOp::create( + 
builder, loc, mlir::RankedTensorType::get({}, type.getElementType()), + zero_scalar); } // device_id is the relative device_id in a mesh (device id - mesh's 1st device @@ -691,15 +696,15 @@ mlir::Value SelectElementsBasedOnId( ops_util::GetR1Const(candidates_flat, builder, loc); const mlir::Value candidates_shape = ops_util::GetR1Const({num_devices, output_shape_size}, builder, loc); - const mlir::Value candidates = builder.create( - loc, candidates_flat_const, candidates_shape); + const mlir::Value candidates = mlir::TF::ReshapeOp::create( + builder, loc, candidates_flat_const, candidates_shape); // Add a zero after the only value in the 1x1 device_id tensor. - const mlir::Value device_id_paddings = builder.create( - loc, ops_util::GetR1Const({0, 1}, builder, loc), + const mlir::Value device_id_paddings = mlir::TF::ReshapeOp::create( + builder, loc, ops_util::GetR1Const({0, 1}, builder, loc), ops_util::GetR1Const({1, 2}, builder, loc)); - const mlir::Value device_id_padded = builder.create( - loc, candidates_shape.getType(), /*input=*/device_id, + const mlir::Value device_id_padded = mlir::TF::PadOp::create( + builder, loc, candidates_shape.getType(), /*input=*/device_id, /*paddings=*/device_id_paddings); // Slice a vertical vector out of the 2D candidates matrix. @@ -707,13 +712,15 @@ mlir::Value SelectElementsBasedOnId( {1, output_shape_size}, builder.getIntegerType(32)); const mlir::Value chosen_shape_const = ops_util::GetR1Const(chosen_shape_type.getShape(), builder, loc); - const mlir::Value chosen = builder.create( - loc, chosen_shape_type, /*input=*/candidates, /*begin=*/device_id_padded, + const mlir::Value chosen = mlir::TF::SliceOp::create( + builder, loc, chosen_shape_type, /*input=*/candidates, + /*begin=*/device_id_padded, /*size=*/chosen_shape_const); // Remove the leading dimension of size 1 before returning the result. 
- return builder.create( - loc, chosen, ops_util::GetR1Const({output_shape_size}, builder, loc)); + return mlir::TF::ReshapeOp::create( + builder, loc, chosen, + ops_util::GetR1Const({output_shape_size}, builder, loc)); } StatusOr GetGroupAssignment( @@ -841,8 +848,8 @@ mlir::LogicalResult LowerAllGatherOpToCollective( new_shape.push_back(input_shape_after_tr[j]); } - auto reshape_op = builder.create( - loc, /*tensor=*/collective_op->getResult(0), + auto reshape_op = mlir::TF::ReshapeOp::create( + builder, loc, /*tensor=*/collective_op->getResult(0), /*shape=*/ops_util::GetR1Const(new_shape, builder, loc)); prev_op_result = reshape_op->getResult(0); @@ -877,8 +884,8 @@ mlir::LogicalResult LowerAllGatherOpToCollective( prev_op_result = post_transpose_op->getResult(0); } - auto output_reshape_op = builder.create( - loc, /*tensor=*/prev_op_result, + auto output_reshape_op = mlir::TF::ReshapeOp::create( + builder, loc, /*tensor=*/prev_op_result, /*shape=*/ops_util::GetR1Const(output_shape, builder, loc)); SetSingleLayoutOnOp(output_reshape_op, tgt_layout); all_gather.replaceAllUsesWith(output_reshape_op->getResult(0)); @@ -900,8 +907,8 @@ mlir::LogicalResult LowerAllGatherOp(mlir::TF::DTensorAllGatherOp all_gather) { builder.setInsertionPointAfter(all_gather); if (concat_dims.empty()) { - mlir::TF::IdentityOp identity = builder.create( - all_gather.getLoc(), all_gather.getInput().getType(), + mlir::TF::IdentityOp identity = mlir::TF::IdentityOp::create( + builder, all_gather.getLoc(), all_gather.getInput().getType(), all_gather.getInput()); SetSingleLayoutOnOp(identity, tgt_layout); @@ -942,7 +949,7 @@ mlir::LogicalResult LowerAllGatherOp(mlir::TF::DTensorAllGatherOp all_gather) { const mlir::Value output_shape_const = Int64Const(builder, loc, output_shape); const mlir::Value zero_scalar = CreateZeroScalar(builder, loc, input_type); const mlir::Value zeros = - builder.create(loc, output_shape_const, zero_scalar); + mlir::TF::FillOp::create(builder, loc, output_shape_const, zero_scalar); // For every possible device ID, generate its strided slice ranges. 
Store all // ranges---num_devices * output_shape_size * (begin, end, stride)---as three @@ -1001,12 +1008,12 @@ mlir::LogicalResult LowerAllGatherOp(mlir::TF::DTensorAllGatherOp all_gather) { if (!tgt_layout.mesh().is_tpu_mesh()) return all_gather.emitOpError() << "source and target layout are not both on tpu"; - update_result = builder.create( - loc, zeros.getType(), /*input=*/zeros, + update_result = mlir::TF::XlaDynamicUpdateSliceOp::create( + builder, loc, zeros.getType(), /*input=*/zeros, /*update=*/all_gather.getInput(), /*indices=*/begin); } else { - update_result = builder.create( - loc, zeros.getType(), + update_result = mlir::TF::TensorStridedSliceUpdateOp::create( + builder, loc, zeros.getType(), /*input=*/zeros, begin, end, strides, /*value=*/all_gather.getInput()); } @@ -1062,9 +1069,9 @@ mlir::LogicalResult LowerAllGatherOp(mlir::TF::DTensorAllGatherOp all_gather) { absl::string_view reduce_type = kReduceOpAdd; if (type && type.getElementType().isInteger(1)) reduce_type = kReduceOpAny; mlir::TF::DTensorAllReduceOp all_reduce = - builder.create( - loc, update_result.getType(), update_result, - builder.create(loc, group_assignment), + mlir::TF::DTensorAllReduceOp::create( + builder, loc, update_result.getType(), update_result, + mlir::TF::ConstOp::create(builder, loc, group_assignment), builder.getStringAttr(std::string(reduce_type)), builder.getStringAttr(device_type)); SetSingleLayoutOnOp(all_reduce, tgt_layout); @@ -1146,12 +1153,12 @@ mlir::LogicalResult LowerAllScatterOp( mlir::Attribute matrix_attr = mlir::DenseIntElementsAttr::get(matrix_type, matrix); mlir::Value matrix_value = - builder.create(all_scatter.getLoc(), matrix_attr) + mlir::TF::ConstOp::create(builder, all_scatter.getLoc(), matrix_attr) .getResult(); // Compute the offset from mult_matrix_value and mesh_coordinates. - mlir::TF::MatMulOp offset = builder.create( - all_scatter.getLoc(), + mlir::TF::MatMulOp offset = mlir::TF::MatMulOp::create( + builder, all_scatter.getLoc(), mlir::RankedTensorType::get({1, original_layout.rank()}, builder.getIntegerType(32)), mesh_coordinates, matrix_value); @@ -1164,14 +1171,14 @@ mlir::LogicalResult LowerAllScatterOp( } // Input to slice needs to be rank 1, so we need to squeeze it. - mlir::TF::SqueezeOp offset_squeezed = builder.create( - all_scatter.getLoc(), + mlir::TF::SqueezeOp offset_squeezed = mlir::TF::SqueezeOp::create( + builder, all_scatter.getLoc(), mlir::RankedTensorType::get({original_layout.rank()}, builder.getIntegerType(32)), offset.getProduct(), builder.getI64ArrayAttr({0})); - auto result = builder.create( - all_scatter.getLoc(), output_type, all_scatter.getInput(), + auto result = mlir::TF::SliceOp::create( + builder, all_scatter.getLoc(), output_type, all_scatter.getInput(), offset_squeezed.getOutput(), slice_shape_value); SetSingleLayoutOnOp(result, desired_layout); @@ -1231,9 +1238,9 @@ mlir::LogicalResult LowerAllToAllOp(mlir::TF::DTensorAllToAllOp all_to_all) { if (mlir::StringRef(device_type).ends_with("TPU")) { // For TPUs, lower to XlaAllToAll. 
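The DTensor lowering hunks above all apply the same mechanical migration: op construction moves from the builder-templated form builder.create<OpTy>(loc, ...) to the static form OpTy::create(builder, loc, ...), with the builder passed as the first argument and every operand and attribute left unchanged. A minimal sketch of the two equivalent spellings, using mlir::TF::ReshapeOp as in the hunks above (the function name and header set are illustrative assumptions, not part of this patch):

#include "mlir/IR/Builders.h"                                // mlir::OpBuilder, mlir::Location
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"   // mlir::TF::ReshapeOp

// Hypothetical helper showing the before/after spelling of op creation.
mlir::Value ReshapeSketch(mlir::OpBuilder& builder, mlir::Location loc,
                          mlir::Value tensor, mlir::Value shape) {
  // Old spelling (removed by this patch): the op type is a template argument.
  //   auto reshape = builder.create<mlir::TF::ReshapeOp>(loc, tensor, shape);
  // New spelling (added by this patch): static create() taking the builder as
  // its first parameter; operands and attributes are passed identically.
  auto reshape = mlir::TF::ReshapeOp::create(builder, loc, tensor, shape);
  return reshape.getResult();
}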
- mlir::Operation* xla_all_to_all = builder.create( - loc, all_to_all.getResult().getType(), all_to_all.getInput(), - builder.create(loc, group_assignment), + mlir::Operation* xla_all_to_all = mlir::TF::AllToAllOp::create( + builder, loc, all_to_all.getResult().getType(), all_to_all.getInput(), + mlir::TF::ConstOp::create(builder, loc, group_assignment), concat_dimension, split_dimension, group_size); SetSingleLayoutOnOp(xla_all_to_all, tgt_layout); all_to_all.replaceAllUsesWith(xla_all_to_all); diff --git a/tensorflow/dtensor/mlir/utils/update_tpu_metadata.cc b/tensorflow/dtensor/mlir/utils/update_tpu_metadata.cc index 6eaeacee29f611..d8a7bcd9705521 100644 --- a/tensorflow/dtensor/mlir/utils/update_tpu_metadata.cc +++ b/tensorflow/dtensor/mlir/utils/update_tpu_metadata.cc @@ -254,8 +254,8 @@ mlir::LogicalResult UpdateTPUCompileMetadata(const Mesh& mesh_config, if (mesh_config.use_xla_spmd()) { // Create a new compile op with the appropriate new number of operands. builder->setInsertionPointAfter(compile); - auto new_compile_op = builder->create( - compile.getLoc(), compile.getCompilationStatus().getType(), + auto new_compile_op = mlir::TF::_TPUCompileMlirOp::create( + *builder, compile.getLoc(), compile.getCompilationStatus().getType(), /*program=*/ llvm::SmallVector( mesh_config.num_devices(), diff --git a/tensorflow/dtensor/mlir/value_utils.cc b/tensorflow/dtensor/mlir/value_utils.cc index e9240996904fd0..9ecdfa424ac723 100644 --- a/tensorflow/dtensor/mlir/value_utils.cc +++ b/tensorflow/dtensor/mlir/value_utils.cc @@ -103,18 +103,18 @@ mlir::Value ReshapeSizeTypeToScalar(mlir::OpBuilder builder, mlir::Location loc, mlir::RankedTensorType::get({}, builder.getIntegerType(32)); mlir::Value scalar_shape = ops_util::GetR1Const(scalar_type.getShape(), builder, loc); - return builder.create( - loc, mlir::ArrayRef{scalar_type}, + return mlir::TF::ReshapeOp::create( + builder, loc, mlir::ArrayRef{scalar_type}, mlir::ArrayRef{tensor, scalar_shape}); } mlir::Value IntConst(mlir::OpBuilder& builder, mlir::Location loc, - llvm::ArrayRef values) { + llvm::ArrayRef values) { auto const_type = mlir::RankedTensorType::get( {static_cast(values.size())}, builder.getIntegerType(32)); mlir::Attribute const_attr = mlir::DenseIntElementsAttr::get(const_type, values); - return builder.create(loc, const_attr).getResult(); + return mlir::TF::ConstOp::create(builder, loc, const_attr).getResult(); } StatusOr> GetTFShapeFromType(mlir::Type type) { @@ -133,7 +133,7 @@ mlir::Value Int64Const(mlir::OpBuilder& builder, mlir::Location loc, {static_cast(values.size())}, builder.getIntegerType(64)); mlir::Attribute const_attr = mlir::DenseIntElementsAttr::get(const_type, values); - return builder.create(loc, const_attr).getResult(); + return mlir::TF::ConstOp::create(builder, loc, const_attr).getResult(); } mlir::Value FloatConst(mlir::OpBuilder& builder, mlir::Location loc, @@ -142,16 +142,17 @@ mlir::Value FloatConst(mlir::OpBuilder& builder, mlir::Location loc, {static_cast(values.size())}, builder.getF32Type()); mlir::Attribute const_attr = mlir::DenseFPElementsAttr::get(const_type, values); - return builder.create(loc, const_attr).getResult(); + return mlir::TF::ConstOp::create(builder, loc, const_attr).getResult(); } mlir::Value StringScalarConst(mlir::OpBuilder& builder, mlir::Location loc, llvm::StringRef value) { - return builder.create( - loc, mlir::DenseStringElementsAttr::get( - mlir::RankedTensorType::get( - {}, builder.getType()), - value)); + return mlir::TF::ConstOp::create( + builder, loc, + 
mlir::DenseStringElementsAttr::get( + mlir::RankedTensorType::get({}, + builder.getType()), + value)); } mlir::Value StringConst(mlir::OpBuilder& builder, mlir::Location loc, @@ -161,7 +162,7 @@ mlir::Value StringConst(mlir::OpBuilder& builder, mlir::Location loc, builder.getType()); mlir::Attribute const_attr = mlir::DenseStringElementsAttr::get(const_type, values); - return builder.create(loc, const_attr).getResult(); + return mlir::TF::ConstOp::create(builder, loc, const_attr).getResult(); } mlir::Value IntConstWithMatchingType(mlir::OpBuilder& builder, @@ -171,7 +172,7 @@ mlir::Value IntConstWithMatchingType(mlir::OpBuilder& builder, if (llvm::cast(type).getElementType().isInteger(64)) { return Int64Const(builder, loc, values); } else { - llvm::SmallVector values32(values.begin(), values.end()); + llvm::SmallVector values32(values.begin(), values.end()); return IntConst(builder, loc, values32); } } @@ -213,14 +214,16 @@ absl::Status ExtractConstVectorFromValue( mlir::Value CreateIntScalarConst(const int64_t value, mlir::OpBuilder builder, mlir::Location loc, bool use_int64) { if (use_int64) { - return builder.create( - loc, mlir::DenseIntElementsAttr::get( - mlir::RankedTensorType::get({}, builder.getI64Type()), value)); + return mlir::TF::ConstOp::create( + builder, loc, + mlir::DenseIntElementsAttr::get( + mlir::RankedTensorType::get({}, builder.getI64Type()), value)); } else { - return builder.create( - loc, mlir::DenseIntElementsAttr::get( - mlir::RankedTensorType::get({}, builder.getI32Type()), - static_cast(value))); + return mlir::TF::ConstOp::create( + builder, loc, + mlir::DenseIntElementsAttr::get( + mlir::RankedTensorType::get({}, builder.getI32Type()), + static_cast(value))); } } @@ -228,32 +231,32 @@ StatusOr CreateZeroScalarConst(mlir::OpBuilder& builder, mlir::Location loc, mlir::Type type) { if (type.isF64()) { - return builder - .create( - loc, mlir::DenseFPElementsAttr::get( - mlir::RankedTensorType::get({}, builder.getF64Type()), - static_cast(0.))) + return mlir::TF::ConstOp::create( + builder, loc, + mlir::DenseFPElementsAttr::get( + mlir::RankedTensorType::get({}, builder.getF64Type()), + static_cast(0.))) .getResult(); } else if (type.isF32()) { - return builder - .create( - loc, mlir::DenseFPElementsAttr::get( - mlir::RankedTensorType::get({}, builder.getF32Type()), - static_cast(0.f))) + return mlir::TF::ConstOp::create( + builder, loc, + mlir::DenseFPElementsAttr::get( + mlir::RankedTensorType::get({}, builder.getF32Type()), + static_cast(0.f))) .getResult(); } else if (type.isInteger(32)) { - return builder - .create( - loc, mlir::DenseIntElementsAttr::get( - mlir::RankedTensorType::get({}, builder.getI32Type()), - static_cast(0))) + return mlir::TF::ConstOp::create( + builder, loc, + mlir::DenseIntElementsAttr::get( + mlir::RankedTensorType::get({}, builder.getI32Type()), + static_cast(0))) .getResult(); } else if (type.isInteger(64)) { - return builder - .create( - loc, mlir::DenseIntElementsAttr::get( - mlir::RankedTensorType::get({}, builder.getI64Type()), - static_cast(0))) + return mlir::TF::ConstOp::create( + builder, loc, + mlir::DenseIntElementsAttr::get( + mlir::RankedTensorType::get({}, builder.getI64Type()), + static_cast(0))) .getResult(); } else { return errors::InvalidArgument( @@ -270,8 +273,9 @@ StatusOr SelectScalarValueFromArray(mlir::OpBuilder& builder, return errors::InvalidArgument("Input array must have shape [1, N]."); } - mlir::TF::SliceOp sliced_value = builder.create( - location, mlir::RankedTensorType::get({1, 1}, 
arrayType.getElementType()), + mlir::TF::SliceOp sliced_value = mlir::TF::SliceOp::create( + builder, location, + mlir::RankedTensorType::get({1, 1}, arrayType.getElementType()), /*input=*/array, /*begin=*/IntConst(builder, location, {0, index}), /*size=*/IntConst(builder, location, {1, 1})); @@ -281,8 +285,8 @@ StatusOr SelectScalarValueFromArray(mlir::OpBuilder& builder, mlir::RankedTensorType::get({}, builder.getIntegerType(32)); mlir::Value scalar_shape = mlir::TF::collection_ops_util::GetR1Const( scalar_size_type.getShape(), builder, location); - mlir::Value scalar_sliced_value = builder.create( - location, mlir::ArrayRef{scalar_size_type}, + mlir::Value scalar_sliced_value = mlir::TF::ReshapeOp::create( + builder, location, mlir::ArrayRef{scalar_size_type}, mlir::ArrayRef{sliced_value.getOutput(), scalar_shape}, mlir::ArrayRef{}); return scalar_sliced_value; diff --git a/tensorflow/dtensor/mlir/value_utils.h b/tensorflow/dtensor/mlir/value_utils.h index 804683bc56a2cc..9775f57c79db11 100644 --- a/tensorflow/dtensor/mlir/value_utils.h +++ b/tensorflow/dtensor/mlir/value_utils.h @@ -48,7 +48,7 @@ StatusOr> GetTFShapeFromType(mlir::Type type); // Return a 1-D int32 constant array with the given values. mlir::Value IntConst(mlir::OpBuilder& builder, mlir::Location loc, - llvm::ArrayRef values); + llvm::ArrayRef values); // Return a 1-D int64 constant array with the given values. mlir::Value Int64Const(mlir::OpBuilder& builder, mlir::Location loc, llvm::ArrayRef values); diff --git a/tensorflow/dtensor/python/tests/BUILD b/tensorflow/dtensor/python/tests/BUILD index 9b38fcdeb48bb0..38c84bc127ef90 100644 --- a/tensorflow/dtensor/python/tests/BUILD +++ b/tensorflow/dtensor/python/tests/BUILD @@ -746,9 +746,6 @@ dtensor_test( "tpu": 10, TPU_V3_DONUT_BACKEND: 32, }, - tags = [ - "cuda-only", - ], deps = [ ":test_util", "//tensorflow/dtensor/python:api", @@ -802,7 +799,6 @@ dtensor_test( }, tags = [ "no_oss_py38", # TODO(b/267017937) - "cuda-only", ], deps = [ ":test_util", diff --git a/tensorflow/lite/CMakeLists.txt b/tensorflow/lite/CMakeLists.txt index 1249b5c01e321f..7e877cd92f1cd1 100644 --- a/tensorflow/lite/CMakeLists.txt +++ b/tensorflow/lite/CMakeLists.txt @@ -749,9 +749,11 @@ add_library(tensorflow-lite set(_ALL_TFLITE_HDRS ${_ALL_TFLITE_SRCS}) list(FILTER _ALL_TFLITE_HDRS INCLUDE REGEX ".*\\.h$") target_include_directories(tensorflow-lite - PUBLIC $ $ - PUBLIC ${CMAKE_CURRENT_BINARY_DIR} - PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/.. + PUBLIC + $ + $ + $ + $ ) target_link_libraries(tensorflow-lite PUBLIC @@ -879,7 +881,9 @@ target_compile_options(_pywrap_tensorflow_interpreter_wrapper PRIVATE ${TFLITE_TARGET_PRIVATE_OPTIONS} ) -target_compile_options(xnnpack-delegate - PUBLIC ${TFLITE_TARGET_PUBLIC_OPTIONS} - PRIVATE ${TFLITE_TARGET_PRIVATE_OPTIONS} -) \ No newline at end of file +if(TFLITE_ENABLE_XNNPACK) + target_compile_options(xnnpack-delegate + PUBLIC ${TFLITE_TARGET_PUBLIC_OPTIONS} + PRIVATE ${TFLITE_TARGET_PRIVATE_OPTIONS} + ) +endif() \ No newline at end of file diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h index 765c2bc12f2d7d..2c2e703735ef07 100644 --- a/tensorflow/lite/core/c/common.h +++ b/tensorflow/lite/core/c/common.h @@ -1352,7 +1352,15 @@ typedef enum TfLiteDelegateFlags { /// operator information using `Profiler::EventType::OPERATOR_INVOKE_EVENT` /// and the results will appear in the operator-wise Profiling section and not /// in the Delegate internal section. 
- kTfLiteDelegateFlagsPerOperatorProfiling = 4 + kTfLiteDelegateFlagsPerOperatorProfiling = 4, + + // This flag can be used by callers to hint that the delegate is likely to + // delegate the entire graph to a single delegate so certain allocations can + // be skipped. + // This is an ADVANCED feature and should only be used if the caller has + // prior knowledge that the delegate will fully delegate all subgraphs + // to a single delegate. + kTfLiteDelegateFlagsHintFullyDelegatedToSingleDelegate = 8, } TfLiteDelegateFlags; /// WARNING: This is an experimental interface that is subject to change. diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 996d36b7e9725f..4d28de5a21ca2a 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -2489,9 +2489,11 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegateImpl(TfLiteDelegate* delegate) { // Restore delegation state if applicable. TF_LITE_ENSURE_STATUS(RedoAllDelegates()); + int64_t delegate_flags = TfLiteDelegateGetFlagsInternal(delegate); const bool delegate_supports_dynamic_shapes = - TfLiteDelegateGetFlagsInternal(delegate) & - kTfLiteDelegateFlagsAllowDynamicTensors; + delegate_flags & kTfLiteDelegateFlagsAllowDynamicTensors; + const bool hint_fully_delegated_to_single_delegate = + delegate_flags & kTfLiteDelegateFlagsHintFullyDelegatedToSingleDelegate; const auto pre_delegation_state = state_; if (state_ == kStateInvokableAndImmutable) { @@ -2500,7 +2502,8 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegateImpl(TfLiteDelegate* delegate) { // tensors. // Reset the state to force tensor/op reallocation. state_ = kStateUninvokable; - } else if (!delegate_supports_dynamic_shapes) { + } else if (!delegate_supports_dynamic_shapes && + !hint_fully_delegated_to_single_delegate) { // Check if graph has dynamic tensors by preparing ops. int last_execution_plan_index_prepared; TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt( @@ -2533,15 +2536,25 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegateImpl(TfLiteDelegate* delegate) { SwitchToKernelContext(); TF_LITE_ENSURE_STATUS(reset_delegation_if_not_ok(status)); + if (hint_fully_delegated_to_single_delegate && !IsFullyDelegated()) { + ReportError( + "Hint fully delegated to single delegate is set, but the graph is not " + "fully delegated."); + return kTfLiteApplicationError; + } + // STEP 3: Leave graph in consistent state based on delegate & previous state. // =========================================================================== if (!delegate_supports_dynamic_shapes) { // CASE 1: Current delegate does not support dynamic shapes. // Reset the state to force tensor/op reallocation. - state_ = kStateUninvokable; - TF_LITE_ENSURE_STATUS( - reset_delegation_if_not_ok(EnsureMemoryAllocations())); + if (!hint_fully_delegated_to_single_delegate) { + state_ = kStateUninvokable; + TF_LITE_ENSURE_STATUS( + reset_delegation_if_not_ok(EnsureMemoryAllocations())); + } + // After using a delegate which doesn't support dynamic tensors, make the // entire graph immutable. 
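The new kTfLiteDelegateFlagsHintFullyDelegatedToSingleDelegate flag (value 8) lets a caller promise that a single delegate will claim every node; the subgraph.cc hunk above then skips the pre-delegation dynamic-shape probe and the extra EnsureMemoryAllocations() pass, and reports kTfLiteApplicationError if the promise is broken. A hedged usage sketch (only the flag wiring is shown; the Prepare callback and the rest of the delegate implementation are elided):

#include "tensorflow/lite/core/c/common.h"

// Sketch: build a delegate that opts into the new hint. The delegate's
// Prepare callback must still claim all nodes; otherwise
// ModifyGraphWithDelegate() fails with kTfLiteApplicationError per the check
// added above.
TfLiteDelegate MakeFullyDelegatingDelegate() {
  TfLiteDelegate delegate = TfLiteDelegateCreate();  // zero-initialized struct
  delegate.flags |= kTfLiteDelegateFlagsHintFullyDelegatedToSingleDelegate;
  // delegate.Prepare = MyDelegatePrepare;  // hypothetical callback, not shown
  return delegate;
}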
state_ = kStateInvokableAndImmutable; diff --git a/tensorflow/lite/delegates/serialization.cc b/tensorflow/lite/delegates/serialization.cc index 0c26589e19bf96..fec19eb6ac34a2 100644 --- a/tensorflow/lite/delegates/serialization.cc +++ b/tensorflow/lite/delegates/serialization.cc @@ -47,6 +47,12 @@ namespace { static const char kDelegatedNodesSuffix[] = "_dnodes"; +#if defined(_WIN32) +static const char kPathSeparator = '\\'; +#else +static const char kPathSeparator = '/'; +#endif // defined(_WIN32) + // Farmhash Fingerprint inline uint64_t CombineFingerprints(uint64_t l, uint64_t h) { // Murmur-inspired hashing. @@ -63,7 +69,8 @@ inline uint64_t CombineFingerprints(uint64_t l, uint64_t h) { inline std::string JoinPath(const std::string& path1, const std::string& path2) { - return (path1.back() == '/') ? (path1 + path2) : (path1 + "/" + path2); + return (path1.back() == kPathSeparator) ? (path1 + path2) + : (path1 + kPathSeparator + path2); } inline std::string GetFilePath(const std::string& cache_dir, diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index a2715c519798a5..02d51f21d4fa4e 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -333,6 +333,16 @@ cc_library( ], ) +cc_library( + name = "macros", + hdrs = ["macros.h"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts(), + deps = [ + "//tensorflow/lite:minimal_logging", + ], +) + flatbuffer_cc_library( name = "weight_cache_schema", srcs = ["weight_cache_schema.fbs"], @@ -350,8 +360,10 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ ":file_util", + ":macros", ":mmap_handle", ":weight_cache_schema", + "//tensorflow/lite:logger", "//tensorflow/lite:minimal_logging", "//tensorflow/lite/c:common", "@XNNPACK", @@ -380,6 +392,21 @@ cc_library( ], ) +cc_library( + name = "fingerprint_test_helpers", + testonly = True, + hdrs = ["fingerprint_test_helpers.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":weight_cache", + ":weight_cache_test_helpers", + ":xnnpack_delegate_hdrs_only", + "//tensorflow/lite/c:common", + "@XNNPACK", + "@com_google_googletest//:gtest", + ], +) + cc_library( name = "mmap_handle", srcs = ["mmap_handle.cc"], @@ -387,8 +414,8 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ ":file_util", + ":macros", ":windows_util", - "//tensorflow/lite:minimal_logging", ], ) @@ -409,6 +436,7 @@ cc_library( hdrs = ["file_util.h"], compatible_with = get_compatible_with_portable(), deps = [ + ":macros", "//tensorflow/lite:minimal_logging", ], ) @@ -1334,6 +1362,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_conv_2d_tester", ":test_main", ":xnnpack_delegate_test_mode", @@ -1350,6 +1379,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_depthwise_conv_2d_tester", ":test_main", ":xnnpack_delegate_test_mode", @@ -1384,6 +1414,7 @@ cc_test( }), deps = [ ":conv_2d_tester", + ":fingerprint_test_helpers", ":test_main", ":xnnpack_delegate_test_mode", "//tensorflow/lite/c:c_api_types", @@ -1433,6 +1464,7 @@ cc_test( }), deps = [ ":depthwise_conv_2d_tester", + ":fingerprint_test_helpers", ":test_main", ":xnnpack_delegate_test_mode", "//tensorflow/lite/c:c_api_types", @@ -1452,6 +1484,7 @@ cc_test( tags = ["notap"], deps = [ ":dynamically_quantized_fully_connected_tester", + ":fingerprint_test_helpers", ":test_main", ":xnnpack_delegate_test_mode", 
"//tensorflow/lite/c:c_api_types", @@ -1468,6 +1501,7 @@ cc_test( }), deps = [ ":dynamically_quantized_conv_2d_tester", + ":fingerprint_test_helpers", ":test_main", ":xnnpack_delegate_test_mode", "//tensorflow/lite/c:c_api_types", @@ -1484,6 +1518,7 @@ cc_test( }), deps = [ ":dynamically_quantized_transpose_conv_tester", + ":fingerprint_test_helpers", ":test_main", ":xnnpack_delegate_test_mode", "//tensorflow/lite/c:c_api_types", @@ -1500,10 +1535,14 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":fully_connected_tester", ":test_main", + ":weight_cache", + ":weight_cache_test_helpers", ":xnnpack_delegate_test_mode", "//tensorflow/lite/c:c_api_types", + "@XNNPACK", "@com_google_googletest//:gtest", ], ) @@ -1851,6 +1890,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_conv_2d_tester", ":test_main", ":xnnpack_delegate_test_mode", @@ -1867,6 +1907,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_depthwise_conv_2d_tester", ":test_main", ":xnnpack_delegate_test_mode", @@ -1917,6 +1958,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_fully_connected_tester", ":test_main", ":xnnpack_delegate_test_mode", @@ -2150,6 +2192,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_transpose_conv_tester", ":test_main", ":xnnpack_delegate_test_mode", @@ -2294,6 +2337,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":test_main", ":transpose_conv_tester", ":xnnpack_delegate_test_mode", @@ -2373,6 +2417,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_conv_2d_tester", ":test_main", ":xnnpack_delegate_test_mode", @@ -2388,6 +2433,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_depthwise_conv_2d_tester", ":test_main", ":xnnpack_delegate_test_mode", @@ -2418,6 +2464,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_fully_connected_tester", ":test_main", ":xnnpack_delegate_test_mode", @@ -2628,6 +2675,7 @@ cc_test( "//conditions:default": [], }), deps = [ + ":fingerprint_test_helpers", ":quantized_transpose_conv_tester", ":test_main", ":xnnpack_delegate_test_mode", diff --git a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc index 92293e08227593..d195d4f25435e8 100644 --- a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc @@ -24,17 +24,16 @@ limitations under the License. 
#include #include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(ChannelwiseQuantizedConv2D, 1x1) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct ChannelwiseQuantizedConv2D : DelegateTest {}; +TEST_F(ChannelwiseQuantizedConv2D, 1x1) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -71,11 +70,7 @@ TEST(ChannelwiseQuantizedConv2D, 1x1) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, 3x3) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, 3x3) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -112,11 +107,7 @@ TEST(ChannelwiseQuantizedConv2D, 3x3) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -155,11 +146,7 @@ TEST(ChannelwiseQuantizedConv2D, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -198,11 +185,7 @@ TEST(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -241,11 +224,7 @@ TEST(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -288,11 +267,7 @@ TEST(ChannelwiseQuantizedConv2D, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -335,11 +310,7 @@ TEST(ChannelwiseQuantizedConv2D, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, DilationWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - 
+TEST_F(ChannelwiseQuantizedConv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -382,11 +353,7 @@ TEST(ChannelwiseQuantizedConv2D, DilationWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, DilationWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -429,11 +396,7 @@ TEST(ChannelwiseQuantizedConv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -476,11 +439,7 @@ TEST(ChannelwiseQuantizedConv2D, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -523,11 +482,7 @@ TEST(ChannelwiseQuantizedConv2D, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -570,13 +525,11 @@ TEST(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, MultiThreading) { +TEST_F(ChannelwiseQuantizedConv2D, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -619,7 +572,7 @@ TEST(ChannelwiseQuantizedConv2D, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, WeightsCache) { +TEST_F(ChannelwiseQuantizedConv2D, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -673,15 +624,13 @@ TEST(ChannelwiseQuantizedConv2D, WeightsCache) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedConv2D, TransientIndirectionBuffer) { +TEST_F(ChannelwiseQuantizedConv2D, TransientIndirectionBuffer) { TfLiteXNNPackDelegateOptions xnnpack_options = TfLiteXNNPackDelegateOptionsDefault(); xnnpack_options.num_threads = 2; xnnpack_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - 
TfLiteXNNPackDelegateDelete); + UseCustomDelegate(xnnpack_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc index 25dada01896c34..0c6de84e9a8d2f 100644 --- a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc @@ -23,18 +23,16 @@ limitations under the License. #include #include -#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(ChannelwiseQuantizedDepthwiseConv2D, 1x1) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct ChannelwiseQuantizedDepthwiseConv2D : DelegateTest {}; +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 1x1) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -66,11 +64,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 1x1) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, 2x2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 2x2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -103,11 +97,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 2x2) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -140,11 +130,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -179,11 +165,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -216,11 +198,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -255,11 +233,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) { .Test(xnnpack_delegate.get()); } 
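The XNNPack test hunks in this patch replace the per-test unique_ptr delegate boilerplate with TEST_F on a shared DelegateTest fixture pulled in from the new fingerprint_test_helpers target; the fixture owns xnnpack_delegate and exposes UseCustomDelegate() for tests that need non-default options. fingerprint_test_helpers.h is not part of this excerpt, so the sketch below is only a plausible reconstruction of its minimal shape, inferred from how the converted tests use it:

#include <memory>

#include <gtest/gtest.h>
#include "tensorflow/lite/core/c/common.h"
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"

// Assumed shape of the fixture used by the TEST_F conversions above: it owns
// an XNNPack delegate built with default options, and UseCustomDelegate()
// rebuilds it from caller-supplied options. The real helper presumably also
// performs the fingerprint/weight-cache setup implied by its name.
class DelegateTest : public ::testing::Test {
 protected:
  void UseCustomDelegate(const TfLiteXNNPackDelegateOptions& options) {
    xnnpack_delegate.reset(TfLiteXNNPackDelegateCreate(&options));
  }

  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
      xnnpack_delegate{TfLiteXNNPackDelegateCreate(nullptr),
                       TfLiteXNNPackDelegateDelete};
};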
-TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -297,11 +271,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -339,11 +309,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -385,11 +351,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -431,11 +393,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -477,11 +435,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -523,11 +477,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -573,11 +523,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 
ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -619,11 +565,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -665,11 +607,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto scale_rng = std::bind( @@ -711,13 +649,11 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) { +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -759,7 +695,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) { +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -812,15 +746,13 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) { .Test(xnnpack_delegate.get()); } -TEST(ChannelwiseQuantizedDepthwiseConv2D, TransientIndirectionBuffer) { +TEST_F(ChannelwiseQuantizedDepthwiseConv2D, TransientIndirectionBuffer) { TfLiteXNNPackDelegateOptions xnnpack_options = TfLiteXNNPackDelegateOptionsDefault(); xnnpack_options.num_threads = 2; xnnpack_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(xnnpack_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc index 25090bbaf2b5cf..e1b5a674946b73 100644 --- a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc @@ -19,18 +19,16 @@ limitations under the License. 
#include #include -#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/delegates/xnnpack/conv_2d_tester.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(Conv2D, 1x1) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct Conv2D : DelegateTest {}; +TEST_F(Conv2D, 1x1) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -52,11 +50,7 @@ TEST(Conv2D, 1x1) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, 3x3) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, 3x3) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -78,11 +72,7 @@ TEST(Conv2D, 3x3) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -106,11 +96,7 @@ TEST(Conv2D, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, Grouped) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, Grouped) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -136,11 +122,7 @@ TEST(Conv2D, Grouped) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -164,11 +146,7 @@ TEST(Conv2D, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -192,11 +170,7 @@ TEST(Conv2D, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -224,11 +198,7 @@ TEST(Conv2D, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -256,11 +226,7 @@ TEST(Conv2D, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, DilationWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -288,11 +254,7 @@ TEST(Conv2D, DilationWithSamePadding) { .Test(xnnpack_delegate.get()); } 
-TEST(Conv2D, DilationWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -320,11 +282,7 @@ TEST(Conv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, FP16Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, FP16Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -352,11 +310,7 @@ TEST(Conv2D, FP16Weights) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, TensorWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, TensorWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -384,11 +338,7 @@ TEST(Conv2D, TensorWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, ChannelWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, ChannelWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -416,11 +366,7 @@ TEST(Conv2D, ChannelWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, SparseWeights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, SparseWeights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -448,11 +394,7 @@ TEST(Conv2D, SparseWeights) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, SparseFP16Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, SparseFP16Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -481,11 +423,7 @@ TEST(Conv2D, SparseFP16Weights) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, SparseTensorWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, SparseTensorWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -514,11 +452,7 @@ TEST(Conv2D, SparseTensorWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, SparseChannelWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, SparseChannelWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -547,11 +481,7 @@ TEST(Conv2D, SparseChannelWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -579,11 +509,7 @@ TEST(Conv2D, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - 
TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -611,11 +537,7 @@ TEST(Conv2D, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -643,11 +565,7 @@ TEST(Conv2D, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, DISABLED_TanhActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, DISABLED_TanhActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -675,11 +593,7 @@ TEST(Conv2D, DISABLED_TanhActivation) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, DISABLED_SignBitActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(Conv2D, DISABLED_SignBitActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -707,13 +621,11 @@ TEST(Conv2D, DISABLED_SignBitActivation) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, MultiThreading) { +TEST_F(Conv2D, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -741,7 +653,7 @@ TEST(Conv2D, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, WeightsCache) { +TEST_F(Conv2D, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -781,15 +690,13 @@ TEST(Conv2D, WeightsCache) { .Test(xnnpack_delegate.get()); } -TEST(Conv2D, TransientIndirectionBuffer) { - TfLiteXNNPackDelegateOptions xnnpack_options = +TEST_F(Conv2D, TransientIndirectionBuffer) { + TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); - xnnpack_options.num_threads = 2; - xnnpack_options.flags |= + delegate_options.num_threads = 2; + delegate_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc index e894bcdc2bc46a..931fff88178dfb 100644 --- a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc @@ -19,18 +19,16 @@ limitations under the License. 
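For tests that need non-default options, the converted pattern (see the Conv2D MultiThreading, WeightsCache, and TransientIndirectionBuffer hunks above) is to fill in TfLiteXNNPackDelegateOptions and hand it to the fixture instead of constructing the delegate inline. A short usage sketch with a hypothetical test name (the tester body is elided):

TEST_F(Conv2D, TransientIndirectionBufferSketch) {
  TfLiteXNNPackDelegateOptions delegate_options =
      TfLiteXNNPackDelegateOptionsDefault();
  delegate_options.num_threads = 2;
  delegate_options.flags |=
      TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
  // Replaces the previous inline TfLiteXNNPackDelegateCreate(&options) call;
  // the fixture now owns the resulting delegate.
  UseCustomDelegate(delegate_options);

  // ... build the Conv2DTester as in the existing tests and run it against
  // the fixture-owned delegate:
  //   Conv2DTester()/* ...params elided... */.Test(xnnpack_delegate.get());
}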
#include #include -#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(DepthwiseConv2D, 1x1) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct DepthwiseConv2D : DelegateTest {}; +TEST_F(DepthwiseConv2D, 1x1) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto input_rng = @@ -47,11 +45,7 @@ TEST(DepthwiseConv2D, 1x1) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, 2x2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, 2x2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto input_rng = @@ -69,11 +63,7 @@ TEST(DepthwiseConv2D, 2x2) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, 3x3) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, 3x3) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto input_rng = @@ -91,11 +81,7 @@ TEST(DepthwiseConv2D, 3x3) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto input_rng = @@ -115,11 +101,7 @@ TEST(DepthwiseConv2D, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, 5x5) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, 5x5) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto input_rng = @@ -137,11 +119,7 @@ TEST(DepthwiseConv2D, 5x5) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, 5x5Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, 5x5Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto input_rng = @@ -161,11 +139,7 @@ TEST(DepthwiseConv2D, 5x5Stride2) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -188,11 +162,7 @@ TEST(DepthwiseConv2D, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -215,11 +185,7 @@ TEST(DepthwiseConv2D, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, StrideWithSamePadding) { std::random_device random_device; auto rng 
= std::mt19937(random_device()); auto batch_rng = @@ -246,11 +212,7 @@ TEST(DepthwiseConv2D, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -277,11 +239,7 @@ TEST(DepthwiseConv2D, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, DilationWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -308,11 +266,7 @@ TEST(DepthwiseConv2D, DilationWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, DilationWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -339,11 +293,7 @@ TEST(DepthwiseConv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, DepthMultiplier) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, DepthMultiplier) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -372,11 +322,7 @@ TEST(DepthwiseConv2D, DepthMultiplier) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, FP16Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, FP16Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -403,11 +349,7 @@ TEST(DepthwiseConv2D, FP16Weights) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -434,11 +376,7 @@ TEST(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -465,11 +403,7 @@ TEST(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, SparseWeights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, SparseWeights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -496,11 +430,7 @@ TEST(DepthwiseConv2D, SparseWeights) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, SparseFP16Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, 
SparseFP16Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -528,11 +458,7 @@ TEST(DepthwiseConv2D, SparseFP16Weights) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -560,11 +486,7 @@ TEST(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -592,11 +514,7 @@ TEST(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -623,11 +541,7 @@ TEST(DepthwiseConv2D, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -654,11 +568,7 @@ TEST(DepthwiseConv2D, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -685,11 +595,7 @@ TEST(DepthwiseConv2D, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, DISABLED_TanhActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, DISABLED_TanhActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -716,11 +622,7 @@ TEST(DepthwiseConv2D, DISABLED_TanhActivation) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, DISABLED_SignBitActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DepthwiseConv2D, DISABLED_SignBitActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -747,13 +649,11 @@ TEST(DepthwiseConv2D, DISABLED_SignBitActivation) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, MultiThreading) { +TEST_F(DepthwiseConv2D, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -780,7 +680,7 
@@ TEST(DepthwiseConv2D, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, WeightsCache) { +TEST_F(DepthwiseConv2D, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -818,15 +716,13 @@ TEST(DepthwiseConv2D, WeightsCache) { .Test(xnnpack_delegate.get()); } -TEST(DepthwiseConv2D, TransientIndirectionBuffer) { +TEST_F(DepthwiseConv2D, TransientIndirectionBuffer) { TfLiteXNNPackDelegateOptions xnnpack_options = TfLiteXNNPackDelegateOptionsDefault(); xnnpack_options.num_threads = 2; xnnpack_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(xnnpack_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc index 59507269580cbd..52e8333db4fd04 100644 --- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc @@ -19,22 +19,16 @@ limitations under the License. #include #include -#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_tester.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(DynamicallyQuantizedConv2D, 3x3) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); +struct DynamicallyQuantizedConv2D : DelegateTest {}; +TEST_F(DynamicallyQuantizedConv2D, 3x3) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -56,15 +50,7 @@ TEST(DynamicallyQuantizedConv2D, 3x3) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, 3x3Stride2) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -88,15 +74,7 @@ TEST(DynamicallyQuantizedConv2D, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, Grouped) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, Grouped) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -122,15 +100,7 @@ 
TEST(DynamicallyQuantizedConv2D, Grouped) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -154,15 +124,7 @@ TEST(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -186,14 +148,7 @@ TEST(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, StrideWithSamePadding) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); +TEST_F(DynamicallyQuantizedConv2D, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -221,15 +176,7 @@ TEST(DynamicallyQuantizedConv2D, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, StrideWithValidPadding) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -257,15 +204,7 @@ TEST(DynamicallyQuantizedConv2D, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, DilationWithSamePadding) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -293,15 +232,7 @@ TEST(DynamicallyQuantizedConv2D, DilationWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, DilationWithValidPadding) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - 
TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -329,15 +260,7 @@ TEST(DynamicallyQuantizedConv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -364,15 +287,7 @@ TEST(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -399,15 +314,7 @@ TEST(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, ReluActivation) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -435,15 +342,7 @@ TEST(DynamicallyQuantizedConv2D, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, Relu6Activation) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -471,15 +370,7 @@ TEST(DynamicallyQuantizedConv2D, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -507,15 +398,7 @@ TEST(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, TanhActivation) { - TfLiteXNNPackDelegateOptions delegate_options = - 
TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, TanhActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -543,15 +426,7 @@ TEST(DynamicallyQuantizedConv2D, TanhActivation) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, SignBitActivation) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedConv2D, SignBitActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -579,15 +454,13 @@ TEST(DynamicallyQuantizedConv2D, SignBitActivation) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, MultiThreading) { +TEST_F(DynamicallyQuantizedConv2D, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; delegate_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -615,7 +488,7 @@ TEST(DynamicallyQuantizedConv2D, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, WeightsCache) { +TEST_F(DynamicallyQuantizedConv2D, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -656,16 +527,14 @@ TEST(DynamicallyQuantizedConv2D, WeightsCache) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedConv2D, TransientIndirectionBuffer) { +TEST_F(DynamicallyQuantizedConv2D, TransientIndirectionBuffer) { TfLiteXNNPackDelegateOptions xnnpack_options = TfLiteXNNPackDelegateOptionsDefault(); xnnpack_options.num_threads = 2; xnnpack_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER; xnnpack_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(xnnpack_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc index 2f198a95195f11..2d2febcb21ab66 100644 --- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc +++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc @@ -21,8 +21,8 @@ limitations under the License. 
#include #include -#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { @@ -30,9 +30,10 @@ namespace xnnpack { // Dummy class to use with parameterized test. class DynamicallyQuantizedFullyConnectedTest - : public testing::TestWithParam {}; + : public testing::WithParamInterface, + public DelegateTest {}; -int GenInputChannels(const std::function &rng, +int GenInputChannels(const std::function& rng, WeightsType weights_type) { switch (weights_type) { case WeightsType::kChannelWiseQuantizedInt8: @@ -45,14 +46,6 @@ int GenInputChannels(const std::function &rng, } TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto channels_rng = @@ -71,14 +64,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, 2D) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -99,14 +84,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2D) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -128,13 +105,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, 3D) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -156,14 +126,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3D) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DReshape) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -184,14 +146,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DReshape) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) { - TfLiteXNNPackDelegateOptions delegate_options = - 
TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -214,14 +168,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, 4D) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -244,14 +190,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4D) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -275,14 +213,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -304,14 +234,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -333,14 +255,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -362,14 +276,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) { } TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) { - TfLiteXNNPackDelegateOptions delegate_options = - TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -393,13 +299,8 @@ 
TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) { TEST_P(DynamicallyQuantizedFullyConnectedTest, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); - delegate_options.flags |= - TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); - + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -429,9 +330,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, WeightsCache) { weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(), TfLiteXNNPackDelegateWeightsCacheDelete); delegate_options.weights_cache = weights_cache.get(); - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc index de863e4f1e2125..4a40e56852b56c 100644 --- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc +++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc @@ -19,18 +19,16 @@ limitations under the License. #include #include -#include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct DynamicallyQuantizedTransposeConvTest : DelegateTest {}; +TEST_F(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -51,10 +49,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +TEST_F(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -75,11 +70,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -100,11 +91,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -125,11 +112,7 @@ 
TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -153,10 +136,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -180,11 +160,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -212,11 +188,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -244,13 +216,11 @@ TEST(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedTransposeConvTest, MultiThreading) { +TEST_F(DynamicallyQuantizedTransposeConvTest, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -279,7 +249,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(DynamicallyQuantizedTransposeConvTest, WeightsCache) { +TEST_F(DynamicallyQuantizedTransposeConvTest, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc index 3bdcd343373bac..abfd76c12a14f9 100644 --- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc +++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc @@ -55,10 +55,12 @@ void DynamicallyQuantizedTransposeConvTester::Test( const Model* model = GetModel(buffer.data()); 
   std::unique_ptr<Interpreter> delegate_interpreter;
-  ASSERT_EQ(InterpreterBuilder(
-                model, ::tflite::ops::builtin::BuiltinOpResolverWithXNNPACK())(
-                &delegate_interpreter),
-            kTfLiteOk);
+  ASSERT_EQ(
+      InterpreterBuilder(
+          model,
+          ::tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates())(
+          &delegate_interpreter),
+      kTfLiteOk);
   std::unique_ptr<Interpreter> default_interpreter;
   ASSERT_EQ(
       InterpreterBuilder(
diff --git a/tensorflow/lite/delegates/xnnpack/file_util.cc b/tensorflow/lite/delegates/xnnpack/file_util.cc
index b475080480ecb4..7fbb917c850e4e 100644
--- a/tensorflow/lite/delegates/xnnpack/file_util.cc
+++ b/tensorflow/lite/delegates/xnnpack/file_util.cc
@@ -39,7 +39,13 @@ limitations under the License.
 #endif  // TFLITE_XNNPACK_IN_MEMORY_FILE_ENABLED
 #endif  // defined(__linux__) || defined(__ANDROID__)
 
+#include
+
+#include
 #include
+#include
+
+#include "tensorflow/lite/delegates/xnnpack/macros.h"
 
 #if !TFLITE_XNNPACK_IN_MEMORY_FILE_ENABLED
 #include "tensorflow/lite/logger.h"
@@ -57,7 +63,7 @@ FileDescriptor FileDescriptor::Duplicate() const {
   if (!IsValid()) {
     return FileDescriptor(-1);
   }
-  return FileDescriptor(dup(fd_));
+  return FileDescriptor::Duplicate(fd_);
 }
 
 void FileDescriptor::Reset(int new_fd) {
@@ -90,6 +96,9 @@ FileDescriptor::Offset FileDescriptorView::MovePos(
 }
 
 FileDescriptor FileDescriptor::Open(const char* path, int flags, mode_t mode) {
+  if (!path) {
+    return {};
+  }
 #if defined(_WIN32)
   if (!(flags & O_TEXT)) {
     flags |= O_BINARY;
@@ -154,5 +163,22 @@ FileDescriptor CreateInMemoryFileDescriptor(const char* path) {
 #endif
 }
 
+bool IsFileEmpty(const char* path, const FileDescriptor& fd) {
+#if defined(_WIN32)
+  struct _stat64 file_stats{};
+  const int res = fd.IsValid() ? _fstat64(fd.Value(), &file_stats)
+                               : _stat64(path, &file_stats);
+#else
+  struct stat file_stats{};
+  const int res =
+      fd.IsValid() ? fstat(fd.Value(), &file_stats) : stat(path, &file_stats);
+#endif
+  XNNPACK_RETURN_CHECK(
+      res == 0 || errno == ENOENT,
+      "could not access file descriptor %d stats to get size ('%s'): %s.",
+      fd.Value(), path, strerror(errno));
+  return file_stats.st_size == 0;
+}
+
 }  // namespace xnnpack
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/file_util.h b/tensorflow/lite/delegates/xnnpack/file_util.h
index cddc0a4c615f06..9817c74d9f7ee6 100644
--- a/tensorflow/lite/delegates/xnnpack/file_util.h
+++ b/tensorflow/lite/delegates/xnnpack/file_util.h
@@ -76,6 +76,14 @@ class FileDescriptorView {
   // WARNING: the file descriptor must be valid and the file must be opened.
   Offset MovePos(Offset offset) const;
 
+  // Returns the size of the file.
+  Offset Size() const {
+    Offset pos = GetPos();
+    Offset size = SetPosFromEnd(0);
+    SetPos(pos);
+    return size;
+  }
+
   // Reads `count` bytes from the file at the current position to `dst`.
   //
   // Returns true if all the data available in the file was read to the buffer
@@ -167,6 +175,11 @@ class FileDescriptor : public FileDescriptorView {
 // descriptor.
 bool InMemoryFileDescriptorAvailable();
 
+// Returns true if the file is empty (the file may exist)
+//
+// Note: if `fd` is valid, then `path` is ignored.
+bool IsFileEmpty(const char* path, const FileDescriptor& fd);
+
 // Creates a new file descriptor that isn't backed by a file system. The file
 // will be automatically cleaned up when the last file descriptor pointing to it
 // is closed.
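The file_util change above adds two small helpers: FileDescriptorView::Size(), which computes the file size by seeking to the end and restoring the previous position, and IsFileEmpty(path, fd), which prefers a valid file descriptor over the path and treats a missing file (ENOENT) as empty rather than as an error. A minimal caller sketch under those assumptions follows; the function name CacheNeedsRebuild and the cache-path argument are illustrative only and are not part of this patch:

#include <fcntl.h>

#include "tensorflow/lite/delegates/xnnpack/file_util.h"

// Hypothetical caller: an empty or missing weight-cache file means the cache
// must be (re)built from scratch.
bool CacheNeedsRebuild(const char* cache_path) {
  tflite::xnnpack::FileDescriptor fd =
      tflite::xnnpack::FileDescriptor::Open(cache_path, O_RDONLY, 0644);
  // A valid descriptor takes precedence over `cache_path`; if the file does
  // not exist, IsFileEmpty reports it as empty instead of failing.
  return tflite::xnnpack::IsFileEmpty(cache_path, fd);
}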
diff --git a/tensorflow/lite/delegates/xnnpack/file_util_test.cc b/tensorflow/lite/delegates/xnnpack/file_util_test.cc index 69196fefa28f52..9a1ce5e50aa5f2 100644 --- a/tensorflow/lite/delegates/xnnpack/file_util_test.cc +++ b/tensorflow/lite/delegates/xnnpack/file_util_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include +#include #include #include #include @@ -25,6 +26,14 @@ limitations under the License. namespace tflite::xnnpack { namespace { +// Returns a path for a temporary file. +// +// Each call will return a new path. +std::string NewTempFilePath() { + static std::atomic i = 0; + return testing::TempDir() + "test_file_" + std::to_string(i++); +} + TEST(FileDescriptorTest, DefaultConstructedIsInvalid) { FileDescriptor fd; EXPECT_FALSE(fd.IsValid()); @@ -54,7 +63,7 @@ TEST(FileDescriptorTest, OpenNullFileFails) { } TEST(FileDescriptorTest, OpenWriteRewindAndReadWorks) { - const std::string tmp_file = testing::TempDir() + __FUNCTION__; + const std::string tmp_file = NewTempFilePath(); FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(), O_CREAT | O_TRUNC | O_RDWR, 0644); ASSERT_TRUE(fd.IsValid()); @@ -67,7 +76,7 @@ TEST(FileDescriptorTest, OpenWriteRewindAndReadWorks) { } TEST(FileDescriptorTest, WriteFailureReturnsFalse) { - const std::string tmp_file = testing::TempDir() + __FUNCTION__; + const std::string tmp_file = NewTempFilePath(); FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(), O_CREAT | O_TRUNC | O_RDONLY, 0644); ASSERT_TRUE(fd.IsValid()); @@ -76,7 +85,7 @@ TEST(FileDescriptorTest, WriteFailureReturnsFalse) { } TEST(FileDescriptorTest, ReadFailureReturnsFalse) { - const std::string tmp_file = testing::TempDir() + __FUNCTION__; + const std::string tmp_file = NewTempFilePath(); FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 0644); ASSERT_TRUE(fd.IsValid()); @@ -84,5 +93,50 @@ TEST(FileDescriptorTest, ReadFailureReturnsFalse) { EXPECT_FALSE(fd.Read(dst_data.data(), dst_data.size())); } +TEST(FileDescriptorTest, IsFileEmptyReturnTrueForAnEmptyFileThatExists) { + const std::string tmp_file = NewTempFilePath(); + FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(), + O_CREAT | O_TRUNC | O_WRONLY, 0644); + fd.Close(); + EXPECT_TRUE(IsFileEmpty(tmp_file.c_str(), FileDescriptor())); +} + +TEST(FileDescriptorTest, IsFileEmptyReturnTrueForAnNonExistingFile) { + const std::string tmp_file = NewTempFilePath(); + EXPECT_TRUE(IsFileEmpty(tmp_file.c_str(), FileDescriptor())); +} + +TEST(FileDescriptorTest, + IsFileEmptyReturnTrueForAnNonExistingFileWithFileDescriptor) { + const std::string tmp_file = NewTempFilePath(); + FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(), + O_CREAT | O_TRUNC | O_WRONLY, 0644); + EXPECT_TRUE(IsFileEmpty("asdfasdf", FileDescriptor())); +} + +TEST(FileDescriptorTest, IsFileEmptyReturnFalseForAFileThatHasContents) { + const std::string tmp_file = NewTempFilePath(); + FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(), + O_CREAT | O_TRUNC | O_WRONLY, 0644); + const std::string src_data = "The quick brown fox jumps over the lazy dog."; + EXPECT_TRUE(fd.Write(src_data.data(), src_data.size())); + EXPECT_FALSE(IsFileEmpty(tmp_file.c_str(), fd)); +} + +TEST(FileDescriptorTest, IsFileEmptyPrioritizesTheFileDescriptor) { + // We open 2 files, put some data only in one and then pass the file name of + // the one that has data and the file descriptor of the empty one. 
+ const std::string tmp_file = NewTempFilePath(); + const std::string tmp_file2 = NewTempFilePath(); + FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(), + O_CREAT | O_TRUNC | O_WRONLY, 0644); + FileDescriptor fd2 = FileDescriptor::Open(tmp_file2.c_str(), + O_CREAT | O_TRUNC | O_WRONLY, 0644); + const std::string src_data = "The quick brown fox jumps over the lazy dog."; + EXPECT_TRUE(fd.Write(src_data.data(), src_data.size())); + fd.Close(); + EXPECT_TRUE(IsFileEmpty(tmp_file.c_str(), fd2)); +} + } // namespace } // namespace tflite::xnnpack diff --git a/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h b/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h new file mode 100644 index 00000000000000..29edbe5a35c841 --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h @@ -0,0 +1,112 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_ +#define TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_ + +#include + +#include +#include +#include "experimental.h" // from @XNNPACK +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/delegates/xnnpack/weight_cache.h" +#include "tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h" +#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" + +namespace tflite::xnnpack { + +struct TfLiteDelegateDeleter { + void operator()(TfLiteDelegate* delegate) { + TfLiteXNNPackDelegateDelete(delegate); + } +}; + +using TfLiteDelegatePtr = + std::unique_ptr; + +struct DelegateTest : public virtual testing::Test { + void SetUp() override { + TfLiteXNNPackDelegateOptions delegate_options = + TfLiteXNNPackDelegateOptionsDefault(); + + // By default, we try to setup a file weight cache to also check fingerprint + // generation. If the test system doesn't support a file system, then the + // cache file will be invalid. + if (cache_file.IsValid()) { + xnn_clear_fingerprints(); + delegate_options.weight_cache_file_path = cache_file.GetCPath(); + delegate_options.weight_cache_file_descriptor = + cache_file.Duplicate().Release(); + delegate_options.flags |= + TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS; + check_for_cache_fingerprints = true; + } + + xnnpack_delegate = + TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&delegate_options)); + ASSERT_THAT(xnnpack_delegate, testing::NotNull()); + } + + void TearDown() override { + if (check_for_cache_fingerprints) { + ASSERT_TRUE(cache_file.IsValid()); + EXPECT_TRUE(IsCompatibleCacheFile(cache_file)); + if (AlterXNNPackFingerprints()) { + EXPECT_FALSE(IsCompatibleCacheFile(cache_file)); + } + } + } + + // Artificially change fingerprint values. + // + // This allows us to check that changing a fingerprint value will make the + // cache file incompatible. + // + // Returns the current number of fingerprints. 
+ int AlterXNNPackFingerprints() { + int i = 0; + int modified = 0; + for (const xnn_fingerprint* fingerprint = xnn_get_fingerprint_by_idx(i); + fingerprint != nullptr; + fingerprint = xnn_get_fingerprint_by_idx(++i)) { + xnn_fingerprint new_fingerprint = *fingerprint; + ++new_fingerprint.value; + xnn_set_fingerprint(new_fingerprint); + ++modified; + } + return modified; + } + + // Replaces the xnnpack delegate with a custom one. + void UseCustomDelegate(const TfLiteXNNPackDelegateOptions& delegate_options) { + check_for_cache_fingerprints = false; + xnnpack_delegate = + TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&delegate_options)); + ASSERT_THAT(xnnpack_delegate, testing::NotNull()); + } + + // Replaces the xnnpack delegate with one that sets up a file backed weight + // cache. + void UseDelegateWithFileWeightCache() {} + + // The default delegate is created in a generic way. + TfLiteDelegatePtr xnnpack_delegate; + tflite::xnnpack::TempFileDesc cache_file; + bool check_for_cache_fingerprints = false; +}; + +} // namespace tflite::xnnpack + +#endif // TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_ diff --git a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc index 92a6074c464f85..6701d0bc1c8f59 100644 --- a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc +++ b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc @@ -19,18 +19,16 @@ limitations under the License. #include #include -#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/fully_connected_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(FullyConnected, 1D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct FullyConnectedTest : public DelegateTest {}; +TEST_F(FullyConnectedTest, 1D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto channels_rng = @@ -45,11 +43,7 @@ TEST(FullyConnected, 1D) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, 1DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, 1DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto channels_rng = @@ -65,11 +59,7 @@ TEST(FullyConnected, 1DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, 2D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, 2D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -87,11 +77,7 @@ TEST(FullyConnected, 2D) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, 2DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, 2DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -110,11 +96,7 @@ TEST(FullyConnected, 2DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, 3D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, 3D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -133,11 
+115,7 @@ TEST(FullyConnected, 3D) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, 3DReshape) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, 3DReshape) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -156,11 +134,7 @@ TEST(FullyConnected, 3DReshape) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, 3DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, 3DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -180,11 +154,7 @@ TEST(FullyConnected, 3DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, 4D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, 4D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -204,11 +174,7 @@ TEST(FullyConnected, 4D) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, 4DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, 4DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto shape_rng = @@ -229,11 +195,7 @@ TEST(FullyConnected, 4DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -252,11 +214,7 @@ TEST(FullyConnected, NoBias) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, FP16Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, FP16Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -275,11 +233,7 @@ TEST(FullyConnected, FP16Weights) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, FP16WeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, FP16WeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -299,11 +253,7 @@ TEST(FullyConnected, FP16WeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, DynamicWeights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, DynamicWeights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -322,11 +272,7 @@ TEST(FullyConnected, DynamicWeights) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, DynamicWeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, DynamicWeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -346,11 +292,7 @@ TEST(FullyConnected, DynamicWeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, DynamicBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - 
TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, DynamicBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -369,11 +311,7 @@ TEST(FullyConnected, DynamicBias) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, DynamicWeightsAndBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, DynamicWeightsAndBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -393,11 +331,7 @@ TEST(FullyConnected, DynamicWeightsAndBias) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, TensorWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -416,11 +350,7 @@ TEST(FullyConnected, TensorWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, TensorWiseQuantizedInt8WeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8WeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -440,11 +370,7 @@ TEST(FullyConnected, TensorWiseQuantizedInt8WeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, ChannelWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -463,11 +389,7 @@ TEST(FullyConnected, ChannelWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, ChannelWiseQuantizedInt8WeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8WeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -487,11 +409,7 @@ TEST(FullyConnected, ChannelWiseQuantizedInt8WeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -510,11 +428,7 @@ TEST(FullyConnected, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -533,11 +447,7 @@ TEST(FullyConnected, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(FullyConnectedTest, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -556,13 +466,11 @@ TEST(FullyConnected, 
ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, MultiThreading) { +TEST_F(FullyConnectedTest, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -581,7 +489,7 @@ TEST(FullyConnected, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(FullyConnected, WeightsCache) { +TEST_F(FullyConnectedTest, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/macros.h b/tensorflow/lite/delegates/xnnpack/macros.h new file mode 100644 index 00000000000000..ef2218ec621107 --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/macros.h @@ -0,0 +1,48 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_MACROS_H_ +#define TENSORFLOW_LITE_DELEGATES_XNNPACK_MACROS_H_ + +#include + +#include "tensorflow/lite/minimal_logging.h" + +#define XNNPACK_LOG_LIMIT 4048 + +#define XNNPACK_ABORT_CHECK(TEST, ...) \ + if (!(TEST)) { \ + char msg[XNNPACK_LOG_LIMIT] = {0}; \ + int bytes = \ + snprintf(msg, XNNPACK_LOG_LIMIT, "%s:%d: ", __FILE__, __LINE__); \ + snprintf(msg + bytes, XNNPACK_LOG_LIMIT - bytes, "" __VA_ARGS__); \ + TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR, msg); \ + std::abort(); \ + } + +#define XNNPACK_VAR_ARG_HEAD(FIRST, ...) FIRST + +#define XNNPACK_RETURN_CHECK(TEST, ...) \ + if (!(TEST)) { \ + if (sizeof(XNNPACK_VAR_ARG_HEAD("" __VA_ARGS__)) > sizeof("")) { \ + char msg[XNNPACK_LOG_LIMIT] = {0}; \ + int bytes = \ + snprintf(msg, XNNPACK_LOG_LIMIT, "%s:%d: ", __FILE__, __LINE__); \ + snprintf(msg + bytes, XNNPACK_LOG_LIMIT - bytes, "" __VA_ARGS__); \ + TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR, msg); \ + } \ + return false; \ + } + +#endif // TENSORFLOW_LITE_DELEGATES_XNNPACK_MACROS_H_ diff --git a/tensorflow/lite/delegates/xnnpack/mmap_handle.cc b/tensorflow/lite/delegates/xnnpack/mmap_handle.cc index 169e284de47f46..92caf07fac811e 100644 --- a/tensorflow/lite/delegates/xnnpack/mmap_handle.cc +++ b/tensorflow/lite/delegates/xnnpack/mmap_handle.cc @@ -32,20 +32,8 @@ limitations under the License. 
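// ---------------------------------------------------------------------------
// A minimal usage sketch of the new check macros defined in macros.h above.
// The LoadCache() helper and its arguments are hypothetical and used only for
// illustration; they are not part of this patch:
//
//   bool LoadCache(const char* path, int fd) {
//     // On failure, logs "<file>:<line>: " plus the formatted message via
//     // TFLITE_LOG_PROD and makes the enclosing function return false.
//     // With no message arguments it silently returns false.
//     XNNPACK_RETURN_CHECK(fd >= 0, "invalid fd %d ('%s')", fd, path);
//     // Fatal invariant: logs the formatted message, then calls std::abort().
//     XNNPACK_ABORT_CHECK(path != nullptr, "path must not be null");
//     return true;
//   }
//
// Both macros expand to a plain if statement rather than a do { } while (0)
// block, so an else placed immediately after an invocation would bind to the
// macro's if; callers such as mmap_handle.cc below use them as standalone
// statements.
// ---------------------------------------------------------------------------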
#include #include "tensorflow/lite/delegates/xnnpack/file_util.h" +#include "tensorflow/lite/delegates/xnnpack/macros.h" #include "tensorflow/lite/delegates/xnnpack/windows_util.h" -#include "tensorflow/lite/logger.h" -#include "tensorflow/lite/minimal_logging.h" - -#define XNNPACK_VAR_ARG_HEAD(FIRST, ...) FIRST - -#define XNNPACK_RETURN_CHECK(TEST, ...) \ - if (!(TEST)) { \ - if (sizeof(XNNPACK_VAR_ARG_HEAD("" __VA_ARGS__)) > sizeof("")) { \ - TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR, \ - "XNNPack weight cache: " __VA_ARGS__); \ - } \ - return false; \ - } namespace tflite::xnnpack { @@ -100,9 +88,10 @@ bool MMapHandle::Map(const FileDescriptorView& fd, const size_t offset, safe_path, strerror(errno)); #else struct stat file_stats; - XNNPACK_RETURN_CHECK(fstat(fd.Value(), &file_stats) == 0, - "could not access file stats to get size ('%s'): %s.", - safe_path, strerror(errno)); + XNNPACK_RETURN_CHECK( + fstat(fd.Value(), &file_stats) == 0, + "could not access file descriptor %d stats to get size ('%s'): %s.", + fd.Value(), safe_path, strerror(errno)); #endif // This will reset data_ and size_ on return until it is deactivated. @@ -149,8 +138,9 @@ bool MMapHandle::Map(const FileDescriptorView& fd, const size_t offset, data_ = static_cast( mmap(/*addr=*/nullptr, size_ + offset_page_adjustment_, PROT_READ, MAP_SHARED, fd.Value(), offset_ - offset_page_adjustment_)); - XNNPACK_RETURN_CHECK(data_ != MAP_FAILED, "could not mmap file (%s): %s.", - safe_path, strerror(errno)); + XNNPACK_RETURN_CHECK(data_ != MAP_FAILED, + "could not mmap file descriptor %d (%s): %s.", + fd.Value(), safe_path, strerror(errno)); #endif unmap_on_error.Deactivate(); return true; diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc index f67ba714b01cc8..06daba0d9bada7 100644 --- a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc @@ -21,17 +21,16 @@ limitations under the License. 
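// ---------------------------------------------------------------------------
// The test conversions in this patch replace the per-test xnnpack_delegate
// smart-pointer boilerplate with a shared googletest fixture (DelegateTest,
// declared in fingerprint_test_helpers.h, which is not shown in this
// section). A rough sketch of what such a fixture could look like; this is an
// assumption for illustration, not the actual helper:
//
//   class DelegateTest : public ::testing::Test {
//    protected:
//     // Rebuilds the delegate with caller-supplied options, e.g. for the
//     // MultiThreading and TransientIndirectionBuffer tests.
//     void UseCustomDelegate(const TfLiteXNNPackDelegateOptions& options) {
//       xnnpack_delegate.reset(TfLiteXNNPackDelegateCreate(&options));
//     }
//     // Delegate built with default options, shared by most tests.
//     std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
//         xnnpack_delegate{TfLiteXNNPackDelegateCreate(nullptr),
//                          TfLiteXNNPackDelegateDelete};
//   };
//
// Individual tests keep calling .Test(xnnpack_delegate.get()) unchanged.
// ---------------------------------------------------------------------------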
#include #include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(SignedQuantizedConv2D, 1x1) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct SignedQuantizedConv2D : DelegateTest {}; +TEST_F(SignedQuantizedConv2D, 1x1) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -59,11 +58,7 @@ TEST(SignedQuantizedConv2D, 1x1) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, 3x3) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, 3x3) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -91,11 +86,7 @@ TEST(SignedQuantizedConv2D, 3x3) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -125,11 +116,7 @@ TEST(SignedQuantizedConv2D, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, Grouped) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, Grouped) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -163,11 +150,7 @@ TEST(SignedQuantizedConv2D, Grouped) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -197,11 +180,7 @@ TEST(SignedQuantizedConv2D, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -231,11 +210,7 @@ TEST(SignedQuantizedConv2D, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -269,11 +244,7 @@ TEST(SignedQuantizedConv2D, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, StrideWithValidPadding) { - std::unique_ptr - 
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -307,11 +278,7 @@ TEST(SignedQuantizedConv2D, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, DilationWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -345,11 +312,7 @@ TEST(SignedQuantizedConv2D, DilationWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, DilationWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -383,11 +346,7 @@ TEST(SignedQuantizedConv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -421,11 +380,7 @@ TEST(SignedQuantizedConv2D, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -459,11 +414,7 @@ TEST(SignedQuantizedConv2D, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedConv2D, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -497,13 +448,11 @@ TEST(SignedQuantizedConv2D, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, MultiThreading) { +TEST_F(SignedQuantizedConv2D, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -537,15 +486,13 @@ TEST(SignedQuantizedConv2D, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedConv2D, TransientIndirectionBuffer) { - TfLiteXNNPackDelegateOptions xnnpack_options = +TEST_F(SignedQuantizedConv2D, TransientIndirectionBuffer) { + TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); - xnnpack_options.num_threads = 2; - 
xnnpack_options.flags |= + delegate_options.num_threads = 2; + delegate_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc index 3acfbaaf34778e..c409b18002ef51 100644 --- a/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc @@ -20,18 +20,16 @@ limitations under the License. #include #include -#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(SignedQuantizedDepthwiseConv2D, 1x1) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct SignedQuantizedDepthwiseConv2D : DelegateTest {}; +TEST_F(SignedQuantizedDepthwiseConv2D, 1x1) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -54,11 +52,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 1x1) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, 2x2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, 2x2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -82,11 +76,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 2x2) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, 3x3) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, 3x3) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -110,11 +100,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 3x3) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -140,11 +126,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, 5x5) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, 5x5) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -168,11 +150,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 5x5) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, 5x5Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - 
+TEST_F(SignedQuantizedDepthwiseConv2D, 5x5Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -198,11 +176,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 5x5Stride2) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -231,11 +205,7 @@ TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -264,11 +234,7 @@ TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -301,11 +267,7 @@ TEST(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -338,11 +300,7 @@ TEST(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -375,11 +333,7 @@ TEST(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -412,11 +366,7 @@ TEST(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, DepthMultiplier) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, DepthMultiplier) { std::random_device 
random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -451,11 +401,7 @@ TEST(SignedQuantizedDepthwiseConv2D, DepthMultiplier) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -488,11 +434,7 @@ TEST(SignedQuantizedDepthwiseConv2D, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -525,11 +467,7 @@ TEST(SignedQuantizedDepthwiseConv2D, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -562,13 +500,11 @@ TEST(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, MultiThreading) { +TEST_F(SignedQuantizedDepthwiseConv2D, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -601,7 +537,7 @@ TEST(SignedQuantizedDepthwiseConv2D, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, WeightsCache) { +TEST_F(SignedQuantizedDepthwiseConv2D, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -645,15 +579,13 @@ TEST(SignedQuantizedDepthwiseConv2D, WeightsCache) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) { - TfLiteXNNPackDelegateOptions xnnpack_options = +TEST_F(SignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) { + TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); - xnnpack_options.num_threads = 2; - xnnpack_options.flags |= + delegate_options.num_threads = 2; + delegate_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git 
a/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc index 3097d314a3a6ab..5a7a9dfd77b24e 100644 --- a/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc +++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc @@ -21,17 +21,16 @@ limitations under the License. #include #include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_fully_connected_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(SignedQuantizedFullyConnected, 1D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct SignedQuantizedFullyConnected : DelegateTest {}; +TEST_F(SignedQuantizedFullyConnected, 1D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -52,11 +51,7 @@ TEST(SignedQuantizedFullyConnected, 1D) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, 1DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, 1DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -78,11 +73,7 @@ TEST(SignedQuantizedFullyConnected, 1DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, 2D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, 2D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -106,11 +97,7 @@ TEST(SignedQuantizedFullyConnected, 2D) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, 2DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, 2DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -135,11 +122,7 @@ TEST(SignedQuantizedFullyConnected, 2DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, 3D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, 3D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -164,11 +147,7 @@ TEST(SignedQuantizedFullyConnected, 3D) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, 3DReshape) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, 3DReshape) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -193,11 +172,7 @@ TEST(SignedQuantizedFullyConnected, 3DReshape) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, 3DKeepDims) { - std::unique_ptr - 
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, 3DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -223,11 +198,7 @@ TEST(SignedQuantizedFullyConnected, 3DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, 4D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, 4D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -253,11 +224,7 @@ TEST(SignedQuantizedFullyConnected, 4D) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, 4DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, 4DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -284,11 +251,7 @@ TEST(SignedQuantizedFullyConnected, 4DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -313,11 +276,7 @@ TEST(SignedQuantizedFullyConnected, NoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -342,11 +301,7 @@ TEST(SignedQuantizedFullyConnected, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -371,11 +326,7 @@ TEST(SignedQuantizedFullyConnected, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedFullyConnected, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -400,13 +351,11 @@ TEST(SignedQuantizedFullyConnected, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, MultiThreading) { +TEST_F(SignedQuantizedFullyConnected, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); 
std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -431,7 +380,7 @@ TEST(SignedQuantizedFullyConnected, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedFullyConnected, WeightsCache) { +TEST_F(SignedQuantizedFullyConnected, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc index 7daae13ebdea16..d4dceb9077ff26 100644 --- a/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc +++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc @@ -20,17 +20,16 @@ limitations under the License. #include #include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(SignedQuantizedTransposeConvTest, 2x2Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct SignedQuantizedTransposeConvTest : DelegateTest {}; +TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -52,11 +51,7 @@ TEST(SignedQuantizedTransposeConvTest, 2x2Stride2) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -79,11 +74,7 @@ TEST(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -105,11 +96,7 @@ TEST(SignedQuantizedTransposeConvTest, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -132,11 +119,7 @@ TEST(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, 4x4Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -158,11 +141,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride2) { .Test(xnnpack_delegate.get()); } 
-TEST(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -185,11 +164,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, 4x4Stride4) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -211,11 +186,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride4) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -238,11 +209,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -267,11 +234,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -297,11 +260,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -326,11 +285,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -356,11 +311,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -389,11 
+340,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -423,11 +370,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -456,11 +399,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -490,11 +429,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, SparseWeights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, SparseWeights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -524,11 +459,7 @@ TEST(SignedQuantizedTransposeConvTest, SparseWeights) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -559,13 +490,11 @@ TEST(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, MultiThreading) { +TEST_F(SignedQuantizedTransposeConvTest, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -595,13 +524,11 @@ TEST(SignedQuantizedTransposeConvTest, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) { +TEST_F(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -632,7 +559,7 @@ TEST(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) { 
.Test(xnnpack_delegate.get()); } -TEST(SignedQuantizedTransposeConvTest, WeightsCache) { +TEST_F(SignedQuantizedTransposeConvTest, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc index 260fd87e282a63..d37317c34f545a 100644 --- a/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc +++ b/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc @@ -19,17 +19,16 @@ limitations under the License. #include #include +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/transpose_conv_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(TransposeConvTest, 2x2Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct TransposeConvTest : DelegateTest {}; +TEST_F(TransposeConvTest, 2x2Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -50,11 +49,7 @@ TEST(TransposeConvTest, 2x2Stride2) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, 2x2Stride2NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, 2x2Stride2NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -76,11 +71,7 @@ TEST(TransposeConvTest, 2x2Stride2NoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -101,11 +92,7 @@ TEST(TransposeConvTest, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, 3x3Stride2NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, 3x3Stride2NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -127,11 +114,7 @@ TEST(TransposeConvTest, 3x3Stride2NoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, 4x4Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, 4x4Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -152,11 +135,7 @@ TEST(TransposeConvTest, 4x4Stride2) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, 4x4Stride2NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, 4x4Stride2NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -178,11 +157,7 @@ TEST(TransposeConvTest, 4x4Stride2NoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, 4x4Stride4) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - 
TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, 4x4Stride4) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -203,11 +178,7 @@ TEST(TransposeConvTest, 4x4Stride4) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, 4x4Stride4NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, 4x4Stride4NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -229,11 +200,7 @@ TEST(TransposeConvTest, 4x4Stride4NoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -257,11 +224,7 @@ TEST(TransposeConvTest, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SmallKernelWithSamePaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SmallKernelWithSamePaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -286,11 +249,7 @@ TEST(TransposeConvTest, SmallKernelWithSamePaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -314,11 +273,7 @@ TEST(TransposeConvTest, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SmallKernelWithValidPaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SmallKernelWithValidPaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -343,11 +298,7 @@ TEST(TransposeConvTest, SmallKernelWithValidPaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -375,11 +326,7 @@ TEST(TransposeConvTest, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, StrideWithSamePaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, StrideWithSamePaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -408,11 +355,7 @@ TEST(TransposeConvTest, StrideWithSamePaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -440,11 +383,7 @@ 
TEST(TransposeConvTest, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, StrideWithValidPaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, StrideWithValidPaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -473,11 +412,7 @@ TEST(TransposeConvTest, StrideWithValidPaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, FP16Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, FP16Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -506,11 +441,7 @@ TEST(TransposeConvTest, FP16Weights) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, FP16WeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, FP16WeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -540,11 +471,7 @@ TEST(TransposeConvTest, FP16WeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, TensorWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, TensorWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -573,11 +500,7 @@ TEST(TransposeConvTest, TensorWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -607,11 +530,7 @@ TEST(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, ChannelWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -640,11 +559,7 @@ TEST(TransposeConvTest, ChannelWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -674,11 +589,7 @@ TEST(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SparseWeights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SparseWeights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -707,11 +618,7 @@ TEST(TransposeConvTest, SparseWeights) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SparseWeightsNoBias) { - std::unique_ptr - 
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SparseWeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -741,11 +648,7 @@ TEST(TransposeConvTest, SparseWeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SparseFP16Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SparseFP16Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -775,11 +678,7 @@ TEST(TransposeConvTest, SparseFP16Weights) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SparseFP16WeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SparseFP16WeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -810,11 +709,7 @@ TEST(TransposeConvTest, SparseFP16WeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -844,11 +739,7 @@ TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -879,11 +770,7 @@ TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -913,11 +800,7 @@ TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -948,13 +831,11 @@ TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, MultiThreading) { +TEST_F(TransposeConvTest, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -983,13 +864,11 @@ TEST(TransposeConvTest, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, 
MultiThreadingNoBias) { +TEST_F(TransposeConvTest, MultiThreadingNoBias) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -1019,7 +898,7 @@ TEST(TransposeConvTest, MultiThreadingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(TransposeConvTest, WeightsCache) { +TEST_F(TransposeConvTest, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc index 6660fc5af75ebe..b8c9d48f4f05a2 100644 --- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc @@ -20,17 +20,16 @@ limitations under the License. #include #include +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(UnsignedQuantizedConv2D, 1x1) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct UnsignedQuantizedConv2D : DelegateTest {}; +TEST_F(UnsignedQuantizedConv2D, 1x1) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -61,11 +60,7 @@ TEST(UnsignedQuantizedConv2D, 1x1) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, 3x3) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, 3x3) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -96,11 +91,7 @@ TEST(UnsignedQuantizedConv2D, 3x3) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -137,11 +128,7 @@ TEST(UnsignedQuantizedConv2D, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, Grouped) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, Grouped) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -174,11 +161,7 @@ TEST(UnsignedQuantizedConv2D, Grouped) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, 
SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -211,11 +194,7 @@ TEST(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -248,11 +227,7 @@ TEST(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -289,11 +264,7 @@ TEST(UnsignedQuantizedConv2D, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -330,11 +301,7 @@ TEST(UnsignedQuantizedConv2D, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, DilationWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -371,11 +338,7 @@ TEST(UnsignedQuantizedConv2D, DilationWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, DilationWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -412,11 +375,7 @@ TEST(UnsignedQuantizedConv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -453,11 +412,7 @@ TEST(UnsignedQuantizedConv2D, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -494,11 +449,7 @@ TEST(UnsignedQuantizedConv2D, Relu6Activation) { 
.Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedConv2D, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -535,13 +486,11 @@ TEST(UnsignedQuantizedConv2D, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, MultiThreading) { +TEST_F(UnsignedQuantizedConv2D, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -578,15 +527,13 @@ TEST(UnsignedQuantizedConv2D, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedConv2D, TransientIndirectionBuffer) { - TfLiteXNNPackDelegateOptions xnnpack_options = +TEST_F(UnsignedQuantizedConv2D, TransientIndirectionBuffer) { + TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); - xnnpack_options.num_threads = 2; - xnnpack_options.flags |= + delegate_options.num_threads = 2; + delegate_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc index 7facb9787338c7..a269343dafc512 100644 --- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc @@ -20,17 +20,16 @@ limitations under the License. 
#include #include +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(UnsignedQuantizedDepthwiseConv2D, 1x1) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct UnsignedQuantizedDepthwiseConv2D : DelegateTest {}; +TEST_F(UnsignedQuantizedDepthwiseConv2D, 1x1) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -56,11 +55,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 1x1) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, 2x2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, 2x2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -87,11 +82,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 2x2) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, 3x3) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -118,11 +109,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 3x3) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -151,11 +138,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, 5x5) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -182,11 +165,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 5x5) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -215,11 +194,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -251,11 +226,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } 
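All of these delegate test files follow the same refactor: each TEST that used to construct its own std::unique_ptr delegate inline becomes a TEST_F on a DelegateTest fixture (for example `struct UnsignedQuantizedDepthwiseConv2D : DelegateTest {};`) pulled in through the newly included fingerprint_test_helpers.h, and tests that need non-default options call UseCustomDelegate(delegate_options) instead of creating the delegate locally. That helper header is not part of these hunks, so the following is only a sketch of its plausible shape, inferred from the call sites: the fixture has to own the `xnnpack_delegate` smart pointer that the unchanged `.Test(xnnpack_delegate.get())` lines keep using, and, given the header's name, it presumably also registers a default test fingerprint in SetUp, which is omitted below.

// Minimal sketch of a DelegateTest fixture, assuming its shape from the call
// sites in the hunks above; member names follow those call sites, everything
// else is an assumption rather than the actual fingerprint_test_helpers.h.
#include <memory>

#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"

namespace tflite {
namespace xnnpack {

struct DelegateTest : testing::Test {
 protected:
  // Rebuilds the delegate from caller-supplied options; MultiThreading,
  // WeightsCache and TransientIndirectionBuffer tests call this first.
  void UseCustomDelegate(const TfLiteXNNPackDelegateOptions& options) {
    xnnpack_delegate.reset(TfLiteXNNPackDelegateCreate(&options));
  }

  // Default delegate owned by the fixture; test bodies keep passing
  // xnnpack_delegate.get() to the per-op testers.
  std::unique_ptr<TfLiteDelegate, decltype(TfLiteXNNPackDelegateDelete)*>
      xnnpack_delegate{TfLiteXNNPackDelegateCreate(nullptr),
                       TfLiteXNNPackDelegateDelete};
};

}  // namespace xnnpack
}  // namespace tflite

Under such a fixture a test body reduces to the tester configuration alone, which is exactly what the surviving + lines in these hunks show.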
-TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -287,11 +258,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -327,11 +294,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -367,11 +330,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -407,11 +366,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -447,11 +402,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -489,11 +440,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -529,11 +476,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) { - 
std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -569,11 +512,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -609,13 +548,11 @@ TEST(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, MultiThreading) { +TEST_F(UnsignedQuantizedDepthwiseConv2D, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -651,7 +588,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, WeightsCache) { +TEST_F(UnsignedQuantizedDepthwiseConv2D, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -698,15 +633,13 @@ TEST(UnsignedQuantizedDepthwiseConv2D, WeightsCache) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) { - TfLiteXNNPackDelegateOptions xnnpack_options = +TEST_F(UnsignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) { + TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); - xnnpack_options.num_threads = 2; - xnnpack_options.flags |= + delegate_options.num_threads = 2; + delegate_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc index 90df47c884d042..25aabd2a559413 100644 --- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc +++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc @@ -20,17 +20,16 @@ limitations under the License. 
#include #include +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_fully_connected_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(UnsignedQuantizedFullyConnected, 1D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct UnsignedQuantizedFullyConnected : DelegateTest {}; +TEST_F(UnsignedQuantizedFullyConnected, 1D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -54,11 +53,7 @@ TEST(UnsignedQuantizedFullyConnected, 1D) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, 1DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, 1DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -83,11 +78,7 @@ TEST(UnsignedQuantizedFullyConnected, 1DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, 2D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, 2D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -114,11 +105,7 @@ TEST(UnsignedQuantizedFullyConnected, 2D) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, 2DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, 2DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -146,11 +133,7 @@ TEST(UnsignedQuantizedFullyConnected, 2DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, 3D) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, 3D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -178,11 +161,7 @@ TEST(UnsignedQuantizedFullyConnected, 3D) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, 3DReshape) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, 3DReshape) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -210,11 +189,7 @@ TEST(UnsignedQuantizedFullyConnected, 3DReshape) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, 3DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, 3DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -243,11 +218,7 @@ TEST(UnsignedQuantizedFullyConnected, 3DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, 4D) { - std::unique_ptr - 
xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, 4D) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -276,11 +247,7 @@ TEST(UnsignedQuantizedFullyConnected, 4D) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, 4DKeepDims) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, 4DKeepDims) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -310,11 +277,7 @@ TEST(UnsignedQuantizedFullyConnected, 4DKeepDims) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -342,11 +305,7 @@ TEST(UnsignedQuantizedFullyConnected, NoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, ReluActivation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, ReluActivation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -374,11 +333,7 @@ TEST(UnsignedQuantizedFullyConnected, ReluActivation) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, Relu6Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, Relu6Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -406,11 +361,7 @@ TEST(UnsignedQuantizedFullyConnected, Relu6Activation) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto zero_point_rng = std::bind(std::uniform_int_distribution( @@ -438,13 +389,11 @@ TEST(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedFullyConnected, MultiThreading) { +TEST_F(UnsignedQuantizedFullyConnected, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc index 8e6a779a1979f9..5167d18443ac30 100644 --- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc +++ 
b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc @@ -19,17 +19,16 @@ limitations under the License. #include #include +#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h" #include "tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" namespace tflite { namespace xnnpack { -TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); +struct UnsignedQuantizedTransposeConvTest : DelegateTest {}; +TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -51,11 +50,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -78,11 +73,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -104,11 +95,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -131,11 +118,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -157,11 +140,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -184,11 +163,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -210,11 +185,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) { 
- std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto output_rng = @@ -237,11 +208,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -266,11 +233,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -296,11 +259,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -325,11 +284,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -355,11 +310,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -388,11 +339,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -422,11 +369,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) { std::random_device random_device; auto rng = 
std::mt19937(random_device()); auto batch_rng = @@ -455,11 +398,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -489,11 +428,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, SparseWeights) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeights) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -523,11 +458,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SparseWeights) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) { - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), - TfLiteXNNPackDelegateDelete); - +TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) { std::random_device random_device; auto rng = std::mt19937(random_device()); auto batch_rng = @@ -558,13 +489,11 @@ TEST(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, MultiThreading) { +TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreading) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -594,13 +523,11 @@ TEST(UnsignedQuantizedTransposeConvTest, MultiThreading) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) { +TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); delegate_options.num_threads = 2; - std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -631,7 +558,7 @@ TEST(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) { .Test(xnnpack_delegate.get()); } -TEST(UnsignedQuantizedTransposeConvTest, WeightsCache) { +TEST_F(UnsignedQuantizedTransposeConvTest, WeightsCache) { TfLiteXNNPackDelegateOptions delegate_options = TfLiteXNNPackDelegateOptionsDefault(); std::unique_ptr - xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), - TfLiteXNNPackDelegateDelete); + UseCustomDelegate(delegate_options); std::random_device random_device; auto rng = std::mt19937(random_device()); diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc index e9ccdbfd8eedd9..9aaf497700f87f 100644 --- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc +++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc @@ -15,6 +15,9 @@ limitations under the License. 
#include "tensorflow/lite/delegates/xnnpack/weight_cache.h" #include + +#include "tensorflow/lite/logger.h" +#include "tensorflow/lite/minimal_logging.h" #if defined(_MSC_VER) #include #define F_OK 0 @@ -22,7 +25,9 @@ limitations under the License. #include #endif +#include #include // IWYU pragma: keep +#include #include #include #include @@ -33,32 +38,15 @@ limitations under the License. #include #include +#include "experimental.h" // from @XNNPACK #include "xnnpack.h" // from @XNNPACK #include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers #include "flatbuffers/verifier.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/delegates/xnnpack/file_util.h" +#include "tensorflow/lite/delegates/xnnpack/macros.h" #include "tensorflow/lite/delegates/xnnpack/mmap_handle.h" #include "tensorflow/lite/delegates/xnnpack/weight_cache_schema_generated.h" -#include "tensorflow/lite/logger.h" -#include "tensorflow/lite/minimal_logging.h" - -#define XNNPACK_ABORT_CHECK(TEST, ...) \ - if (!(TEST)) { \ - TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR, __VA_ARGS__); \ - std::abort(); \ - } - -#define XNNPACK_VAR_ARG_HEAD(FIRST, ...) FIRST - -#define XNNPACK_RETURN_CHECK(TEST, ...) \ - if (!(TEST)) { \ - if (sizeof(XNNPACK_VAR_ARG_HEAD("" __VA_ARGS__)) > sizeof("")) { \ - TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR, \ - "XNNPack weight cache: " __VA_ARGS__); \ - } \ - return false; \ - } namespace tflite::xnnpack { @@ -92,6 +80,20 @@ bool FileExists(const char* path) { return access(path, F_OK) != -1; } +bool CheckFingerprints(const cache::schema::BufferList* buffer_list) { + if (buffer_list->fingerprints()) { + for (uint64_t cache_fingerprint : *buffer_list->fingerprints()) { + xnn_fingerprint fingerprint; + static_assert(sizeof(fingerprint) == sizeof(cache_fingerprint)); + std::memcpy(&fingerprint, &cache_fingerprint, sizeof(fingerprint)); + XNNPACK_RETURN_CHECK( + xnn_check_fingerprint(fingerprint) == xnn_status_success, + "fingerprint (id: 0x%x) could not be matched", fingerprint.id); + } + } + return true; +} + } // namespace #define XNN_MOVE_CONSTRUCT_MEMBER(x) x(std::move(other.x)) @@ -134,12 +136,17 @@ bool WeightCacheBuilder::Start(const char* path, const FileDescriptor& fd) { XNNPackCacheHeader header{XNNPackCacheHeader::kInvalidHeader}; header.buffer_list_offset = sizeof(header); - XNNPACK_RETURN_CHECK(fd_.Truncate(0), "could not truncate weight cache"); + XNNPACK_RETURN_CHECK(fd_.Truncate(0), "could not truncate weight cache."); + XNNPACK_RETURN_CHECK(fd_.SetPos(0) == 0, "couldn't move to file start."); XNNPACK_RETURN_CHECK(fd_.Write(&header, sizeof(header)), "could not write initial cache header in %s: %s.", file_path_.c_str(), strerror(errno)); schema_.base_offset = Align(sizeof(header), kMinAlignment); + + XNNPACK_RETURN_CHECK(StartBuildStep(), "failed to start initial write step."); + XNNPACK_RETURN_CHECK(StopBuildStep(), "failed to write initial step."); + return true; } @@ -191,7 +198,8 @@ void* WeightCacheBuilder::Reserve(size_t size) { } BufferLocation WeightCacheBuilder::Append(PackIdentifier pack_id, - const void* data, uint64_t size) { + const void* data, uint64_t size, + int32_t fingerprint_id) { XNNPACK_ABORT_CHECK(is_build_step_, "cannot append data to an unstarted builder."); // Add some padding so that the cache file can be mmaped and the buffer @@ -210,6 +218,34 @@ BufferLocation WeightCacheBuilder::Append(PackIdentifier pack_id, buffer.size = loc.size; schema_.buffers.push_back(std::make_unique(buffer)); + // Not passing a fingerprint id is a 
logic error on XNNPack's side. If we + // don't have a fingerprint for an operation, we have no way of ensuring that + // the generation of the cached data hasn't changed when reloading the cache. + // + // If we just log this and continue on with the work. This run will build a + // cache with cached data that can't be checked in the future. This will lead, + // in future runs that reuse the cache, to crashes that are impossible to + // debug or outputs that are nonsensical without any chance of linking this + // back to this error. + // + // We abort because we have no way of making that failure bubble up to the + // calling code to handle it gracefully... + XNNPACK_ABORT_CHECK(fingerprint_id != 0, + "XNNPack weight cache: no fingerprint identifier was set " + "when appending a buffer to the cache file."); + const xnn_fingerprint* fingerprint = xnn_get_fingerprint(fingerprint_id); + XNNPACK_ABORT_CHECK(fingerprint, + "XNNPack weight cache: could not find a fingerprint with " + "id 0x%x when appending a buffer to the cache file.", + fingerprint_id); + uint64_t fingerprint_value; + static_assert(sizeof(fingerprint_value) == sizeof(*fingerprint)); + std::memcpy(&fingerprint_value, fingerprint, sizeof(*fingerprint)); + if (std::find(schema_.fingerprints.begin(), schema_.fingerprints.end(), + fingerprint_value) == schema_.fingerprints.end()) { + schema_.fingerprints.push_back(fingerprint_value); + } + if (!fd_.Write(data, size)) { TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR, "XNNPack weight cache: cannot append buffer to cache file"); @@ -242,16 +278,7 @@ bool WeightCacheBuilder::StopBuildStep() { XNNPACK_RETURN_CHECK(fd_.SetPos(layout_offset) != -1, "could not move in the file: %s", strerror(errno)); - XNNPACK_RETURN_CHECK( - sizeof(XNNPackCacheHeader::xnnpack_build_identifier) == - xnn_experimental_get_build_identifier_size(), - "cache file ('%s') header cannot hold XNNPack's build identifier: %s.", - file_path_.c_str(), strerror(errno)); - XNNPackCacheHeader header{XNNPackCacheHeader::kVersion}; - memcpy(header.xnnpack_build_identifier, - xnn_experimental_get_build_identifier_data(), - xnn_experimental_get_build_identifier_size()); header.buffer_list_offset = fd_.GetPos(); header.buffer_list_size = builder.GetSize(); @@ -339,7 +366,8 @@ bool MMapWeightCacheProvider::LoadOrStartBuild(const char* path, } const char* const safe_path = Sanitize(path); FileDescriptor build_fd = fd.Duplicate(); - if (!IsInMemoryCachePath(safe_path) && Load(safe_path, std::move(fd))) { + if (!IsInMemoryCachePath(safe_path) && !IsFileEmpty(safe_path, fd) && + Load(safe_path, std::move(fd))) { TFLITE_LOG_PROD(tflite::TFLITE_LOG_VERBOSE, "XNNPack weight cache loaded from '%s'.", safe_path); return true; @@ -409,16 +437,10 @@ bool MMapWeightCacheProvider::Load() { }(); XNNPACK_RETURN_CHECK(header.version == XNNPackCacheHeader::kVersion, - "incompatible header version. Got %zd, expected %zd. " - "Cache needs to be built again.", + "incompatible header version. Got %" PRIu64 + ", expected %" PRIu64 ". Cache needs to be built again.", header.version, XNNPackCacheHeader::kVersion); - XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier( - header.xnnpack_build_identifier, - sizeof(header.xnnpack_build_identifier)), - "XNNPack weight cache: incompatible XNNPack version. 
" - "Cache needs to be built again."); - XNNPACK_RETURN_CHECK(header.buffer_list_offset < mmap_handle.size(), "invalid offset for buffer list descriptor."); @@ -438,6 +460,8 @@ bool MMapWeightCacheProvider::Load() { XNNPACK_RETURN_CHECK(buffer_list, "could not get packed weights from flatbuffer."); + XNNPACK_RETURN_CHECK(CheckFingerprints(buffer_list)); + mmap_buffer_base_offset_ = buffer_list->base_offset(); if (const auto buffers = buffer_list->buffers(); buffers) { for (auto* buffer : *buffers) { @@ -592,7 +616,8 @@ size_t MMapWeightCacheProvider::LookUpOrInsert( return offset_it->second.offset; } - const BufferLocation location = builder_.Append(pack_id, ptr, size); + const BufferLocation location = + builder_.Append(pack_id, ptr, size, cache_key->fingerprint_id); XNNPACK_ABORT_CHECK(!location.IsInvalid(), "Inserting data in the cache failed."); cache_key_to_offset_.emplace(pack_id, location); @@ -682,17 +707,39 @@ bool IsCompatibleCacheFile(const char* path) { FileDescriptor fd = FileDescriptor::Open(path, O_RDONLY); XNNPACK_RETURN_CHECK(fd.IsValid(), "Could not open file: %s: %s.", path, strerror(errno)); + return IsCompatibleCacheFile(std::move(fd)); +} + +bool IsCompatibleCacheFile(FileDescriptorView fd) { + XNNPACK_RETURN_CHECK(fd.IsValid(), "Invalid file descriptor: %d.", + fd.Value()); + const size_t current_pos = fd.GetPos(); + ScopeGuard reset_pos_on_return( + [current_pos, &fd] { fd.SetPos(current_pos); }); + XNNPACK_RETURN_CHECK(fd.SetPos(0) != -1, + "Couldn't move to the start of the file."); + XNNPackCacheHeader header; XNNPACK_RETURN_CHECK(fd.Read(&header, sizeof(header)), "Couldn't read file header."); - XNNPACK_RETURN_CHECK( - header.version == XNNPackCacheHeader::kVersion, - "Cache header version is incompatible. Expected %llu, got %llu.", - XNNPackCacheHeader::kVersion, header.version); - XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier( - header.xnnpack_build_identifier, - sizeof(header.xnnpack_build_identifier)), - "Cache header build identifier is different."); + XNNPACK_RETURN_CHECK(header.version == XNNPackCacheHeader::kVersion, + "Cache header version is incompatible. Expected %" PRIu64 + ", got %" PRIu64 ".", + XNNPackCacheHeader::kVersion, header.version); + + fd.SetPos(header.buffer_list_offset); + auto buffer = std::make_unique(header.buffer_list_size); + XNNPACK_RETURN_CHECK(fd.Read(buffer.get(), header.buffer_list_size)); + + flatbuffers::Verifier verifier(buffer.get(), header.buffer_list_size); + XNNPACK_RETURN_CHECK(cache::schema::VerifyBufferListBuffer(verifier), + "buffer list validation failed."); + + const cache::schema::BufferList* buffer_list = + cache::schema::GetBufferList(buffer.get()); + XNNPACK_RETURN_CHECK(buffer_list, + "could not get packed weights from flatbuffer."); + XNNPACK_RETURN_CHECK(CheckFingerprints(buffer_list)); return true; } diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.h b/tensorflow/lite/delegates/xnnpack/weight_cache.h index 7dd04a20f2095f..781422b4bec662 100644 --- a/tensorflow/lite/delegates/xnnpack/weight_cache.h +++ b/tensorflow/lite/delegates/xnnpack/weight_cache.h @@ -56,15 +56,25 @@ inline constexpr char kInMemoryCachePath[] = ":memory"; // When reading a cache file, the cache should be rejected if `version` // doesn't match `kVersion`. 
struct XNNPackCacheHeader { - enum : uint64_t { kInvalidHeader = 0, kVersion = 1 }; + enum : uint64_t { kInvalidHeader = 0, kVersion = 2 }; uint64_t version; - uint8_t xnnpack_build_identifier[32]; uint64_t buffer_list_offset; uint64_t buffer_list_size; }; +// Checks if the file at the given path is compatible with the current XNNPack +// weight cache. bool IsCompatibleCacheFile(const char* path); +// Checks if the opened file is compatible with the current XNNPack weight +// cache. +// +// Position in the file may be changed during the function execution but is +// restored upon exiting. +// +// Note: the file descriptor must be open and valid. +bool IsCompatibleCacheFile(FileDescriptorView fd); + struct PackIdentifier { enum { kNoId = SIZE_MAX }; uint64_t pack_algorithm_id = kNoId; @@ -150,8 +160,8 @@ class WeightCacheBuilder { // The buffer space must have been reserved before using `Reserve`. If not, a // new call to `Reserve` will be done and the data will be copied over. [[nodiscard /*The location to the appended data should be saved.*/]] - BufferLocation Append(PackIdentifier pack_id, const void* data, - uint64_t size); + BufferLocation Append(PackIdentifier pack_id, const void* data, uint64_t size, + int fingerprint_id); // Writes the flatbuffer to disk. [[nodiscard /*Writing the weight cache can fail.*/]] diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs b/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs index 33566b8be2208a..37f19612010709 100644 --- a/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs +++ b/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs @@ -32,11 +32,14 @@ table Buffer { } table BufferList { + /// A list of packing fingerprints. All of these need to be checked when + /// loading the cache to ensure that it is compatible. + fingerprints: [uint64]; /// A list of buffers. buffers: [Buffer]; /// Defines the base offset for the data in the file. That offset /// may be needed to guarantee data alignment. - base_offset:uint64; + base_offset: uint64; } root_type BufferList; diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc index a74e40018e1eba..c1e4071ff4a353 100644 --- a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc +++ b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc @@ -35,6 +35,7 @@ limitations under the License. 
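Taken together, the weight_cache.cc, weight_cache.h and weight_cache_schema.fbs hunks above replace the old all-or-nothing build-identifier check with per-packing-routine fingerprints: WeightCacheBuilder::Append now records the xnn_fingerprint registered for each buffer's fingerprint_id into the new BufferList.fingerprints column (deduplicated and flattened to uint64), the cache header drops xnnpack_build_identifier and bumps kVersion to 2, and both Load() and the new IsCompatibleCacheFile(FileDescriptorView) overload reject a file if any stored fingerprint no longer passes xnn_check_fingerprint. The sketch below restates that round-trip in isolation; the helper names are illustrative, and the exact header declaring the experimental fingerprint API is assumed from the includes added in this diff.

#include <cstdint>
#include <cstring>

#include "experimental.h"  // from @XNNPACK, assumed to declare the fingerprint API
#include "xnnpack.h"       // from @XNNPACK

// Build side: flatten the fingerprint registered for `fingerprint_id` into
// the uint64 that ends up in BufferList.fingerprints. The real Append aborts
// when no fingerprint is registered; this sketch just reports failure.
inline bool FlattenFingerprintForCache(int32_t fingerprint_id,
                                       uint64_t& flattened) {
  const xnn_fingerprint* fingerprint = xnn_get_fingerprint(fingerprint_id);
  if (fingerprint == nullptr) return false;
  static_assert(sizeof(flattened) == sizeof(*fingerprint));
  std::memcpy(&flattened, fingerprint, sizeof(*fingerprint));
  return true;
}

// Load side: a cached buffer is only trusted if every stored fingerprint
// still matches what the currently running XNNPack build reports.
inline bool CachedFingerprintStillValid(uint64_t flattened) {
  xnn_fingerprint fingerprint;
  std::memcpy(&fingerprint, &flattened, sizeof(fingerprint));
  return xnn_check_fingerprint(fingerprint) == xnn_status_success;
}

This is also why xnn_weights_cache_look_up_key now carries a fingerprint_id, and why the tests that follow seed a fingerprint with xnn_set_fingerprint before exercising the cache.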
#include #include +#include "experimental.h" // from @XNNPACK #include "xnnpack.h" // from @XNNPACK #include "flatbuffers/verifier.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" @@ -56,7 +57,13 @@ namespace { using testing::ElementsAreArray; -TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) { +static xnn_fingerprint kDefaultFingerprint{/*id=*/0xf00d, /*value=*/0xb33f}; + +struct WeightCacheBuilderTest : testing::Test { + void SetUp() override { xnn_set_fingerprint(kDefaultFingerprint); } +}; + +TEST_F(WeightCacheBuilderTest, ReserveAppendWriteWorks) { using std::size; const std::string payload = "This is some data in the file."; @@ -72,7 +79,8 @@ TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) { const size_t payload_size = size(payload); void* buffer = builder.Reserve(payload_size); std::memcpy(buffer, payload.c_str(), payload_size); - auto loc = builder.Append(dummy_id, buffer, payload_size); + auto loc = + builder.Append(dummy_id, buffer, payload_size, kDefaultFingerprint.id); EXPECT_EQ(loc.size, payload_size); EXPECT_GE(builder.capacity(), payload_size); @@ -123,7 +131,7 @@ TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) { EXPECT_THAT(cache_data, ElementsAreArray(payload)); } -TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) { +TEST_F(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) { using std::size; const std::string payload = "This is some data in the file."; @@ -137,7 +145,8 @@ TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) { ASSERT_TRUE(builder.StartBuildStep()); const size_t payload_size = size(payload); - auto loc = builder.Append(dummy_id, payload.c_str(), payload_size); + auto loc = builder.Append(dummy_id, payload.c_str(), payload_size, + kDefaultFingerprint.id); EXPECT_EQ(loc.size, payload_size); @@ -186,7 +195,7 @@ TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) { EXPECT_THAT(cache_data, ElementsAreArray(payload)); } -TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) { +TEST_F(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) { const std::string cache_path = testing::TempDir() + "/cache"; const std::string payload = "This is some data in the file."; const PackIdentifier dummy_id{1, 2, 3}; @@ -198,7 +207,8 @@ TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) { ASSERT_TRUE(builder.StartBuildStep()); const size_t payload_size = size(payload); - auto loc = builder.Append(dummy_id, payload.c_str(), payload_size); + auto loc = builder.Append(dummy_id, payload.c_str(), payload_size, + kDefaultFingerprint.id); EXPECT_EQ(loc.size, payload_size); ASSERT_TRUE(builder.StopBuildStep()); @@ -218,13 +228,13 @@ TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) { EXPECT_FALSE(builder.StartBuildStep()); } -TEST(WeightCacheBuilderTest, InvalidFileDescriptorFails) { +TEST_F(WeightCacheBuilderTest, InvalidFileDescriptorFails) { WeightCacheBuilder builder; EXPECT_FALSE(builder.Start("", FileDescriptor())); EXPECT_FALSE(builder.Start("/seldf/sedsft", FileDescriptor())); } -TEST(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) { +TEST_F(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) { if (!TfLiteXNNPackDelegateCanUseInMemoryWeightCacheProvider()) { GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or " "isn't supported by the current system, skipping test."; @@ -239,7 +249,7 @@ TEST(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) { EXPECT_EQ(errno, ENOENT); } -TEST(WeightCacheBuilderTest, MultipleStepBuild) { +TEST_F(WeightCacheBuilderTest, 
MultipleStepBuild) { using std::size; const std::string payload1 = "This is some data in the file."; @@ -262,7 +272,8 @@ TEST(WeightCacheBuilderTest, MultipleStepBuild) { const size_t payload_size = size(payload1); void* buffer = builder.Reserve(payload_size); std::memcpy(buffer, payload1.c_str(), payload_size); - const auto loc = builder.Append(dummy_id1, buffer, payload_size); + const auto loc = + builder.Append(dummy_id1, buffer, payload_size, kDefaultFingerprint.id); EXPECT_EQ(loc.size, payload_size); EXPECT_GE(builder.capacity(), payload_size); } @@ -270,7 +281,8 @@ TEST(WeightCacheBuilderTest, MultipleStepBuild) { const size_t payload_size = size(payload3); void* buffer = builder.Reserve(payload_size); std::memcpy(buffer, payload3.c_str(), payload_size); - const auto loc = builder.Append(dummy_id3, buffer, payload_size); + const auto loc = + builder.Append(dummy_id3, buffer, payload_size, kDefaultFingerprint.id); (void)loc; } @@ -284,7 +296,8 @@ TEST(WeightCacheBuilderTest, MultipleStepBuild) { const size_t payload_size = size(payload2); void* buffer = builder.Reserve(payload_size); std::memcpy(buffer, payload2.c_str(), payload_size); - const auto loc = builder.Append(dummy_id2, buffer, payload_size); + const auto loc = + builder.Append(dummy_id2, buffer, payload_size, kDefaultFingerprint.id); EXPECT_EQ(loc.size, payload_size); EXPECT_GE(builder.capacity(), payload_size); } @@ -389,7 +402,8 @@ struct FakeContext { const int weights_index) const { return {.seed = algorithm_seed, .kernel = buffers[weights_index].data(), - .bias = nullptr}; + .bias = nullptr, + .fingerprint_id = kDefaultFingerprint.id}; } // Creates a look up key for the XNNPack weight provider C interface. @@ -398,7 +412,8 @@ struct FakeContext { const int bias_index) const { return {.seed = algorithm_seed, .kernel = buffers[weights_index].data(), - .bias = buffers[bias_index].data()}; + .bias = buffers[bias_index].data(), + .fingerprint_id = kDefaultFingerprint.id}; } // Helps creating fake packed data. @@ -505,6 +520,7 @@ struct BuildMMapWeightCacheProviderTest : testing::TestWithParam { GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or " "isn't supported by the current system, skipping test."; } + xnn_set_fingerprint(kDefaultFingerprint); AddTensors(); EndSetup(); } @@ -723,6 +739,7 @@ struct MMapWeightCacheProviderTest : testing::TestWithParam { GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or " "isn't supported by the current system, skipping test."; } + xnn_set_fingerprint(kDefaultFingerprint); } bool use_explicit_fd = GetParam().use_explicit_fd; const char* const explicit_fd_path = GetParam().explicit_fd_path; @@ -783,12 +800,14 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) { const xnn_weights_cache_look_up_key look_up_key_1{ .seed = fake_packing_algo_seed, .kernel = tensors[0].data.data, - .bias = tensors[1].data.data}; + .bias = tensors[1].data.data, + .fingerprint_id = kDefaultFingerprint.id}; const xnn_weights_cache_look_up_key look_up_key_3{ .seed = fake_packing_algo_seed, .kernel = tensors[3].data.data, - .bias = tensors[4].data.data}; + .bias = tensors[4].data.data, + .fingerprint_id = kDefaultFingerprint.id}; // Lookup non-packed tensor. 
ASSERT_EQ(cache->look_up(cache, &look_up_key_1), SIZE_MAX); @@ -829,7 +848,8 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) { const xnn_weights_cache_look_up_key look_up_key_2{ .seed = fake_packing_algo_seed, .kernel = tensors[2].data.data, - .bias = tensors[3].data.data}; + .bias = tensors[3].data.data, + .fingerprint_id = kDefaultFingerprint.id}; const size_t build_offset_2 = cache->look_up_or_insert( cache, &look_up_key_2, (void*)packed_data_ref_2, @@ -904,17 +924,20 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) { const xnn_weights_cache_look_up_key look_up_key_1{ .seed = fake_packing_algo_seed, .kernel = tensors[0].data.data, - .bias = tensors[1].data.data}; + .bias = tensors[1].data.data, + .fingerprint_id = kDefaultFingerprint.id}; const xnn_weights_cache_look_up_key look_up_key_2{ .seed = fake_packing_algo_seed, .kernel = tensors[2].data.data, - .bias = tensors[3].data.data}; + .bias = tensors[3].data.data, + .fingerprint_id = kDefaultFingerprint.id}; const xnn_weights_cache_look_up_key look_up_key_3{ .seed = fake_packing_algo_seed, .kernel = tensors[3].data.data, - .bias = tensors[4].data.data}; + .bias = tensors[4].data.data, + .fingerprint_id = kDefaultFingerprint.id}; ASSERT_TRUE(cache->is_finalized(cache)); @@ -945,65 +968,149 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) { } } -TEST_P(MMapWeightCacheProviderTest, XnnpackRebuildOnVersionMismatch) { +TEST_P(MMapWeightCacheProviderTest, CacheIsRebuiltOnFingerprintMismatch) { + if (use_in_memory_cache) { + GTEST_SUCCEED() << "In-memory cache is never reloaded."; + return; + } TempFileDesc temp_fd; const char* temp_fd_cpath = explicit_fd_path; - FileDescriptor temp_fd_value = temp_fd.Duplicate(); - { // Set bad build identifier - XNNPackCacheHeader header{.version = XNNPackCacheHeader::kVersion}; - header.xnnpack_build_identifier[0] += 1; - ASSERT_TRUE(temp_fd_value.Write(&header, sizeof(header))); + xnn_fingerprint test_fingeprint{0x7357, 0xF33D}; + { // Build a cache file with a specific fingerprint. + // Clear fingerprints and add a test fingerprint to XNNPack. + xnn_clear_fingerprints(); + xnn_set_fingerprint(test_fingeprint); + + // Build a cache file. + MMapWeightCacheProvider cache_provider; + + const char kernel[] = "Fake data."; + TfLiteTensor tensor; + tensor.data.data = (void*)kernel; + cache_provider.MapTensorIdentifiers( + &tensor, /*size=*/1, /*tensor_index_to_identifier=*/{{0, 1}}); + ASSERT_TRUE( + cache_provider.LoadOrStartBuild(temp_fd_cpath, temp_fd.Duplicate())); + ASSERT_TRUE(cache_provider.StartBuildStep()); + const xnn_weights_cache_look_up_key look_up_key_1{ + .seed = 1234, + .kernel = kernel, + .bias = nullptr, + .fingerprint_id = test_fingeprint.id}; + xnn_weights_cache_t cache = &cache_provider.GetCacheProvider(); + const size_t build_offset_1 = cache->look_up_or_insert( + cache, &look_up_key_1, + const_cast(reinterpret_cast(kernel)), + sizeof(kernel)); + (void)build_offset_1; + ASSERT_TRUE(cache_provider.StopBuildStep()); } if (!use_explicit_fd) { temp_fd.Close(); temp_fd_cpath = temp_fd.GetCPath(); - temp_fd_value.Close(); - if (use_in_memory_cache) { - temp_fd_cpath = kInMemoryCachePath; - } } + // Change the test fingerprint value. + test_fingeprint.value = 0xdeadb33f; + xnn_set_fingerprint(test_fingeprint); + + // Reload the file. 
auto build_cache_provider = std::make_unique(); MMapWeightCacheProvider& cache_provider = *build_cache_provider; - ASSERT_TRUE(cache_provider.LoadOrStartBuild(temp_fd_cpath, - temp_fd_value.Duplicate())); + ASSERT_TRUE( + cache_provider.LoadOrStartBuild(temp_fd_cpath, temp_fd.Duplicate())); ASSERT_TRUE(cache_provider.StartBuildStep()); } -class IsCompatibleCacheFileTest : public testing::Test { +enum class IsCompatibleCacheFileTestOverload { kPath, kDescriptor }; + +class IsCompatibleCacheFileTest + : public testing::TestWithParam { public: + using Param = IsCompatibleCacheFileTestOverload; + void SetUp() override { - header_.version = XNNPackCacheHeader::kVersion; - memcpy(header_.xnnpack_build_identifier, - xnn_experimental_get_build_identifier_data(), - xnn_experimental_get_build_identifier_size()); + xnn_clear_fingerprints(); + xnn_set_fingerprint(kDefaultFingerprint); + + // Build a cache file. + MMapWeightCacheProvider cache_provider; + + const char kernel[] = "Fake data."; + TfLiteTensor tensor; + tensor.data.data = (void*)kernel; + cache_provider.MapTensorIdentifiers( + &tensor, /*size=*/1, /*tensor_index_to_identifier=*/{{0, 1}}); + ASSERT_TRUE( + cache_provider.LoadOrStartBuild(fd_.GetCPath(), fd_.Duplicate())); + ASSERT_TRUE(cache_provider.StartBuildStep()); + const xnn_weights_cache_look_up_key look_up_key_1{ + .seed = 1234, + .kernel = kernel, + .bias = nullptr, + .fingerprint_id = kDefaultFingerprint.id}; + xnn_weights_cache_t cache = &cache_provider.GetCacheProvider(); + const size_t build_offset_1 = cache->look_up_or_insert( + cache, &look_up_key_1, + const_cast(reinterpret_cast(kernel)), + sizeof(kernel)); + (void)build_offset_1; + ASSERT_TRUE(cache_provider.StopBuildStep()); } - bool WriteHeaderAndReturnIsCompatibleCacheFile() { - const bool res = fd_.Write(&header_, sizeof(header_)); - fd_.Close(); - return res && IsCompatibleCacheFile(fd_.GetCPath()); + void ChangeRuntimeFingerprintValue() { + xnn_set_fingerprint( + {kDefaultFingerprint.id, kDefaultFingerprint.value + 1}); + } + + bool CallIsCompatibleCacheFile() { + switch (GetParam()) { + case Param::kPath: + fd_.Close(); + return IsCompatibleCacheFile(fd_.GetCPath()); + case Param::kDescriptor: { + const auto pos = fd_.GetPos(); + EXPECT_NE(pos, 0); // We test with a non zero position. 
+ return IsCompatibleCacheFile(fd_); + EXPECT_EQ(fd_.GetPos(), pos); + } + } } - XNNPackCacheHeader header_{}; TempFileDesc fd_; }; -TEST_F(IsCompatibleCacheFileTest, ReturnsTrueForACorrectHeader) { - EXPECT_TRUE(WriteHeaderAndReturnIsCompatibleCacheFile()); +std::string Name( + const testing::TestParamInfo& info) { + switch (info.param) { + case IsCompatibleCacheFileTestOverload::kPath: + return "WithPathOverload"; + case IsCompatibleCacheFileTestOverload::kDescriptor: + return "WithFileDescriptorOverload"; + } } -TEST_F(IsCompatibleCacheFileTest, ReturnsFalseForWrongHeaderVersion) { - header_.version += 1; - EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile()); +TEST_P(IsCompatibleCacheFileTest, ReturnsTrueWhenFingerprintMatches) { + EXPECT_TRUE(CallIsCompatibleCacheFile()); } -TEST_F(IsCompatibleCacheFileTest, ReturnsFalseForWrongBuildIdentifier) { - header_.xnnpack_build_identifier[0] += 1; - EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile()); +TEST_P(IsCompatibleCacheFileTest, ReturnsFalseWhenFingerprintMismatches) { + ChangeRuntimeFingerprintValue(); + EXPECT_FALSE(CallIsCompatibleCacheFile()); } +TEST_P(IsCompatibleCacheFileTest, ReturnsFalseWhenFingerprintIsNotFound) { + xnn_clear_fingerprints(); + EXPECT_FALSE(CallIsCompatibleCacheFile()); +} + +INSTANTIATE_TEST_SUITE_P( + Test, IsCompatibleCacheFileTest, + testing::Values(IsCompatibleCacheFileTest::Param::kPath, + IsCompatibleCacheFileTest::Param::kDescriptor), + Name); + } // namespace } // namespace tflite::xnnpack diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h b/tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h index 365f94dc6ce885..ab29545730664d 100644 --- a/tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h +++ b/tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h @@ -86,14 +86,14 @@ class TempFileDesc : public FileDescriptor { errno_t err = tmpnam_s(filename, L_tmpnam_s); if (err) { fprintf(stderr, "Could not create temporary filename.\n"); - std::abort(); + return; } path_ = filename; FileDescriptor fd = FileDescriptor::Open(path_.c_str(), _O_CREAT | _O_EXCL | _O_RDWR, 0644); if (!fd.IsValid()) { fprintf(stderr, "Could not create temporary file.\n"); - std::abort(); + return; } Reset(fd.Release()); } diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index a869ca74a4cc5e..f390b8065caac2 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -211,7 +211,7 @@ bool CheckZeroPointForPerChannelQuantization( // be 8. 
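// The operator change below tightens the zero-point check: with '&&', any
// non-zero zero point on a kTfLiteInt4 tensor slipped through; with '||',
// only kTfLiteInt4 with zero_point == 8 (or any type with zero_point == 0)
// is accepted. Worked cases for the new predicate
//   reject = (zp != 0) && (type != kTfLiteInt4 || zp != 8):
//     zp == 0, any type     -> accept (unchanged)
//     zp == 8, kTfLiteInt4  -> accept (unchanged)
//     zp == 5, kTfLiteInt4  -> reject (previously accepted)
//     zp == 8, kTfLiteInt8  -> reject (previously accepted)
//     zp == 5, kTfLiteInt8  -> reject (unchanged)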
for (int c = 0; c < quantization_zero_point.size; c++) { const int zero_point = quantization_zero_point.data[c]; - if (zero_point != 0 && (tensor.type != kTfLiteInt4 && zero_point != 8)) { + if (zero_point != 0 && (tensor.type != kTfLiteInt4 || zero_point != 8)) { TF_LITE_KERNEL_LOG(context, "unsupported zero-point value (%d) in channel %d of " "%s tensor %d in XNNPACK delegate", @@ -268,7 +268,8 @@ xnn_datatype GetXNNPackDatatype(TfLiteContext* context, return xnn_datatype_quint8; } case kTfLiteInt8: - case kTfLiteInt4: { + case kTfLiteInt4: + case kTfLiteInt2: { switch (tensor.quantization.type) { case kTfLiteAffineQuantization: { const auto quantization_params = @@ -320,6 +321,8 @@ xnn_datatype GetXNNPackDatatype(TfLiteContext* context, return xnn_datatype_qcint8; case kTfLiteInt4: return xnn_datatype_qcint4; + case kTfLiteInt2: + return xnn_datatype_qcint2; default: // Outermost switch prevents this TFL_UNREACHABLE(); @@ -528,6 +531,22 @@ TfLiteStatus DefineXNNPACKValue(TfLiteContext* context, xnn_subgraph_t subgraph, dims.size(), dims.data(), data, XNN_INVALID_VALUE_ID, flags, xnnpack_id); } break; + case xnn_datatype_qcint2: { + status = xnn_define_channelwise_quantized_tensor_value_v3( + subgraph, datatype, + static_cast( + tensor.quantization.params) + ->zero_point->data[0], + static_cast( + tensor.quantization.params) + ->scale->data, + dims.size(), + static_cast( + tensor.quantization.params) + ->quantized_dimension, + dims.data(), data, XNN_INVALID_VALUE_ID, flags, xnnpack_id, + /*channelwise_zero_point=*/nullptr); + } break; case xnn_datatype_qcint4: case xnn_datatype_qcint8: case xnn_datatype_qcint32: @@ -2228,18 +2247,21 @@ class Subgraph { return kTfLiteError; } - static TfLiteStatus CheckTensorFloat32OrFloat16OrQCInt4OrQCInt8Type( - const Delegate& delegate, TfLiteContext* context, - const TfLiteTensor& tensor, int expected_quantized_dimension, - int tensor_index, int node_index) { + static TfLiteStatus CheckTensorFilterType(const Delegate& delegate, + TfLiteContext* context, + const TfLiteTensor& tensor, + int expected_quantized_dimension, + int tensor_index, int node_index) { switch (tensor.type) { case kTfLiteFloat32: case kTfLiteFloat16: return kTfLiteOk; + case kTfLiteInt2: case kTfLiteInt4: case kTfLiteInt8: if (delegate.support_signed_8bit_quantization() && - (kTfLiteInt8 == tensor.type || kTfLiteInt4 == tensor.type)) { + (kTfLiteInt8 == tensor.type || kTfLiteInt4 == tensor.type || + kTfLiteInt2 == tensor.type)) { switch (tensor.quantization.type) { case kTfLiteAffineQuantization: { const TfLiteAffineQuantization* quantization_params = @@ -2277,6 +2299,20 @@ class Subgraph { quantization_params->quantized_dimension, tensor_index, node_index); return kTfLiteError; + } else if (tensor.type == kTfLiteInt2 && + quantization_params->scale->size != + SizeOfDimension( + &tensor, + quantization_params->quantized_dimension)) { + // Only per channel quantized 2 bit weights are supported. + TF_LITE_MAYBE_KERNEL_LOG( + context, + "2 bit weights must be per channel and not per tensor " + "quantized in channel #%" PRId32 + " in tensor #%d in node #%d", + quantization_params->quantized_dimension, tensor_index, + node_index); + return kTfLiteError; } break; } @@ -4489,7 +4525,7 @@ class Subgraph { // Dynamic filter is supported, but only for FP32. 
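// Context for the kTfLiteInt2 paths added above and below: 2-bit weights are
// only accepted on the per-channel (qcint2) path, and the FULLY_CONNECTED
// check below additionally requires the input-channel count to be a multiple
// of 4. That constraint is consistent with four 2-bit values packing into one
// byte; an illustrative packing routine (not the delegate's actual code,
// values assumed to be in [-2, 1]):
#include <cstddef>
#include <cstdint>
#include <vector>
inline std::vector<uint8_t> PackInt2(const std::vector<int8_t>& values) {
  std::vector<uint8_t> packed((values.size() + 3) / 4, 0);
  for (size_t i = 0; i < values.size(); ++i) {
    // Keep the low two bits of each value and place four values per byte.
    packed[i / 4] |= (static_cast<uint8_t>(values[i]) & 0x3) << (2 * (i % 4));
  }
  return packed;
}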
if (!(delegate.support_dynamic_fully_connected_operator() && filter_tensor.type == kTfLiteFloat32)) { - TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrFloat16OrQCInt4OrQCInt8Type( + TF_LITE_ENSURE_STATUS(CheckTensorFilterType( delegate, logging_context, filter_tensor, /*expected_quantized_dimension=*/0, filter_tensor_id, node_index)); if (quasi_static_tensors.count(filter_tensor_id) == 0) { @@ -4543,10 +4579,12 @@ class Subgraph { bool dynamically_quantized = (!delegate.disable_dynamically_quantized_ops() && (input_tensor.type == kTfLiteFloat32 && - (filter_tensor.type == kTfLiteInt4 || + (filter_tensor.type == kTfLiteInt2 || + filter_tensor.type == kTfLiteInt4 || filter_tensor.type == kTfLiteInt8))); bool supported_srq = (input_tensor.type == kTfLiteInt8 && - (filter_tensor.type == kTfLiteInt4 || + (filter_tensor.type == kTfLiteInt2 || + filter_tensor.type == kTfLiteInt4 || filter_tensor.type == kTfLiteInt8)); if (input_tensor.type != output_tensor.type || ((input_tensor.type != filter_tensor.type) && @@ -4567,6 +4605,15 @@ class Subgraph { return kTfLiteError; } + if (filter_tensor.type == kTfLiteInt2 && input_channels % 4 != 0) { + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "unsupported non-multiple of 4 number of inputs channels (%d) in" + " FULLY_CONNECTED operator #%d", + input_channels, node_index); + return kTfLiteError; + } + float output_min = -std::numeric_limits::infinity(); float output_max = +std::numeric_limits::infinity(); TF_LITE_ENSURE_STATUS(ConvertActivationToOutputRange( @@ -4644,6 +4691,16 @@ class Subgraph { &filter_tensor.dims->data[NumDimensions(&filter_tensor)]); uint32_t kernel_id = XNN_INVALID_VALUE_ID; switch (filter_datatype) { + case xnn_datatype_qcint2: { + int32_t zero_point_value = filter_params->zero_point->data[0]; + status = xnn_define_channelwise_quantized_tensor_value_v3( + subgraph, filter_datatype, zero_point_value, + filter_params->scale->data, filter_dims.size(), + /*channel_dim=*/0, filter_dims.data(), + GetTensorData(&filter_tensor), XNN_INVALID_VALUE_ID, + /*flags=*/0, &kernel_id, /*channelwise_zero_point=*/nullptr); + break; + } case xnn_datatype_qcint4: case xnn_datatype_qcint8: { int32_t zero_point_value = filter_params->zero_point->data[0]; diff --git a/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc index 8cba5779565223..56692cbcaeecb7 100644 --- a/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc +++ b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc @@ -292,8 +292,8 @@ class AudioMicrofrontendOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("AudioMicrofrontend") .Device(tensorflow::DEVICE_CPU) - .TypeConstraint("out_type"), - AudioMicrofrontendOp); + .TypeConstraint("out_type"), + AudioMicrofrontendOp); REGISTER_KERNEL_BUILDER(Name("AudioMicrofrontend") .Device(tensorflow::DEVICE_CPU) .TypeConstraint("out_type"), diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 3249969563db1d..00fdb0c9b77f18 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -39,6 +39,21 @@ exports_files([ "tflite_version_script.lds", ]) +exports_files([ + # go/keep-sorted start + "src/main/java/org/tensorflow/lite/DataType.java", + "src/main/java/org/tensorflow/lite/DataTypeUtils.java", + "src/main/java/org/tensorflow/lite/InterpreterFactory.java", + "src/main/java/org/tensorflow/lite/NativeInterpreterWrapperExperimental.java", + 
"src/main/java/org/tensorflow/lite/NativeSignatureRunnerWrapper.java", + "src/main/java/org/tensorflow/lite/RuntimeFlavor.java", + "src/main/java/org/tensorflow/lite/Tensor.java", + "src/main/java/org/tensorflow/lite/TensorImpl.java", + "src/main/java/org/tensorflow/lite/annotations/UsedByReflection.java", + "src/main/java/org/tensorflow/lite/package-info.java", + # go/keep-sorted end +]) + #----------------------------------------------------------------------------- # Filegroup targets. @@ -928,6 +943,17 @@ filegroup( visibility = ["//visibility:public"], ) +filegroup( + name = "portable_tests_for_litert", + srcs = [ + "src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java", + "src/test/java/org/tensorflow/lite/SupportedFeatures.java", + "src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java", + "src/test/java/org/tensorflow/lite/TestInit.java", + ], + visibility = ["//visibility:public"], +) + # portable_flex_tests includes files for testing interpreter with Flex delegate. filegroup( name = "portable_flex_tests", diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 6a3ec9f57e2a02..5a47ace22d912b 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -167,7 +167,7 @@ cc_library( "//tensorflow/lite/tools/optimize:quantization_utils", "//tensorflow/lite/tools/serialization:writer_lib", "//tensorflow/lite/tools/versioning", - "@FP16", + "//tensorflow/lite/types:half", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base", "@com_google_absl//absl/base:core_headers", @@ -574,6 +574,7 @@ cc_test( "//tensorflow/lite:array", "//tensorflow/lite:util", "//tensorflow/lite/core/c:common", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest_main", ], ) @@ -833,6 +834,7 @@ cc_library( "@ruy//ruy/profiler:instrumentation", "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/c:common", + "//tensorflow/lite/types:half", "//tensorflow/lite:array", "//tensorflow/lite:builtin_ops", "//tensorflow/lite:cc_api_stable", @@ -1118,6 +1120,7 @@ cc_test( "//tensorflow/lite/core:framework_stable", "//tensorflow/lite/core/api", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", "@flatbuffers", @@ -1499,6 +1502,7 @@ cc_test( "//tensorflow/lite/core/c:c_api_types", "//tensorflow/lite/kernels/internal:tensor_utils_no_eigen", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_absl//absl/random", "@com_google_absl//absl/types:span", "@com_google_googletest//:gtest", @@ -1515,8 +1519,8 @@ cc_test( ":test_main", ":test_util", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", - "@eigen_archive//:eigen3", "@flatbuffers", ], ) @@ -1709,6 +1713,7 @@ cc_test( ":test_main", ":test_util", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", ], ) @@ -1739,6 +1744,7 @@ cc_test( ":test_main", ":test_util", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", "@eigen_archive//:eigen3", ], @@ -1853,6 +1859,7 @@ cc_test( ":test_main", ":test_util", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", "@eigen_archive//:eigen3", ], @@ -1913,8 +1920,8 @@ cc_test( ":test_util", "//tensorflow/lite/core/c:common", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", 
"@com_google_googletest//:gtest", - "@flatbuffers", ], ) @@ -1972,6 +1979,7 @@ cc_test( "//tensorflow/lite:framework_stable", "//tensorflow/lite/core:framework_stable", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -2082,12 +2090,12 @@ cc_test( deps = [ ":test_main", ":test_util", - "//tensorflow/lite:string", "//tensorflow/lite/core/c:common", "//tensorflow/lite/kernels/internal:tensor_utils", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", - "@flatbuffers", + "@eigen_archive//:eigen3", ], ) @@ -2101,7 +2109,9 @@ cc_test( "//tensorflow/lite:string", "//tensorflow/lite/core/c:common", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", + "@eigen_archive//:eigen3", "@flatbuffers", ], ) @@ -2485,7 +2495,9 @@ cc_test( ":test_main", ":test_util", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", + "@eigen_archive//:eigen3", ], ) @@ -2530,6 +2542,7 @@ cc_test( ":test_util", "//tensorflow/lite:string", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -2544,6 +2557,7 @@ cc_test( ":test_main", ":test_util", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -2581,8 +2595,8 @@ cc_test( "//tensorflow/lite/kernels/internal:tensor_ctypes", "//tensorflow/lite/kernels/internal:tensor_utils_no_eigen", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", - "@eigen_archive//:eigen3", ], ) @@ -2881,6 +2895,7 @@ cc_test( ":test_util", "//tensorflow/lite:string", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", "@eigen_archive//:eigen3", ], @@ -2905,6 +2920,7 @@ cc_test( ":test_main", ":test_util", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", ], ) @@ -3173,12 +3189,13 @@ cc_test( size = "small", srcs = ["dynamic_update_slice_test.cc"], deps = [ + ":subgraph_test_util", ":test_main", ":test_util", "//tensorflow/lite:framework_stable", "//tensorflow/lite/core:framework_stable", - "//tensorflow/lite/kernels:subgraph_test_util", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest", "@eigen_archive//:eigen3", "@flatbuffers", diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc index 42747a87e61b2a..96bb22ed76c431 100644 --- a/tensorflow/lite/kernels/activations_test.cc +++ b/tensorflow/lite/kernels/activations_test.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/string_type.h" +#include "tensorflow/lite/types/half.h" namespace tflite { @@ -574,18 +575,17 @@ TEST_P(TanhOpTest, Tanh) { } TEST_P(TanhOpTest, TanhFloat16) { - FloatActivationsOpModel m( - GetRegistration(), BuiltinOperator_TANH, - /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}}); + FloatActivationsOpModel m(GetRegistration(), BuiltinOperator_TANH, + /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}}); m.SetInput({ - Eigen::half(0), - Eigen::half(-6), - Eigen::half(2), - Eigen::half(4), - Eigen::half(3), - Eigen::half(-2), - Eigen::half(10), - Eigen::half(1), + half(0), + half(-6), + half(2), + half(4), + half(3), + half(-2), + half(10), + half(1), }); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear( @@ -1210,18 +1210,17 @@ TEST_P(LogisticOpTest, SigmoidFloat32) { } TEST_P(LogisticOpTest, SigmoidFloat16) { - FloatActivationsOpModel m( - GetRegistration(), BuiltinOperator_LOGISTIC, - /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}}); + FloatActivationsOpModel m(GetRegistration(), BuiltinOperator_LOGISTIC, + /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}}); m.SetInput({ - Eigen::half{-1.2f}, - Eigen::half{-6.0f}, - Eigen::half{2.0f}, - Eigen::half{4.0f}, - Eigen::half{3.0f}, - Eigen::half{-2.0f}, - Eigen::half{10.0f}, - Eigen::half{1.0f}, + half{-1.2f}, + half{-6.0f}, + half{2.0f}, + half{4.0f}, + half{3.0f}, + half{-2.0f}, + half{10.0f}, + half{1.0f}, }); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear( diff --git a/tensorflow/lite/kernels/atan2_test.cc b/tensorflow/lite/kernels/atan2_test.cc index 309ba79f284f3f..0c3839361570a6 100644 --- a/tensorflow/lite/kernels/atan2_test.cc +++ b/tensorflow/lite/kernels/atan2_test.cc @@ -17,6 +17,7 @@ #include #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -34,7 +35,7 @@ tflite::TensorType GetTTEnum() { } template <> -tflite::TensorType GetTTEnum() { +tflite::TensorType GetTTEnum() { return tflite::TensorType_FLOAT16; } @@ -74,7 +75,7 @@ class Atan2Test : public ::testing::Test { using FloatType = Float; }; -using TestTypes = ::testing::Types; +using TestTypes = ::testing::Types; TYPED_TEST_SUITE(Atan2Test, TestTypes); @@ -85,15 +86,15 @@ TYPED_TEST(Atan2Test, TestScalar) { tflite::TensorData output = {GetTTEnum(), {}}; Atan2Model m(y, x, output); - auto got = m.GetOutput({Float(0.0)}, {Float(0.0)}); + auto got = m.GetOutput({Float(0.0f)}, {Float(0.0f)}); ASSERT_EQ(got.size(), 1); EXPECT_FLOAT_EQ(got[0], 0.0); - ASSERT_FLOAT_EQ(m.GetOutput({Float(1.0)}, {Float(0.0)})[0], - Float(M_PI / 2)); - ASSERT_FLOAT_EQ(m.GetOutput({Float(0.0)}, {Float(1.0)})[0], - Float(0.0)); - ASSERT_FLOAT_EQ(m.GetOutput({Float(-1.0)}, {Float(0.0)})[0], - Float(-M_PI / 2)); + ASSERT_FLOAT_EQ(m.GetOutput({Float(1.0f)}, {Float(0.0f)})[0], + Float(static_cast(M_PI / 2))); + ASSERT_FLOAT_EQ(m.GetOutput({Float(0.0f)}, {Float(1.0f)})[0], + Float(0.0f)); + ASSERT_FLOAT_EQ(m.GetOutput({Float(-1.0f)}, {Float(0.0f)})[0], + Float(-static_cast(M_PI / 2))); } TYPED_TEST(Atan2Test, TestBatch) { @@ -102,10 +103,12 @@ TYPED_TEST(Atan2Test, TestBatch) { tflite::TensorData x = {GetTTEnum(), {4, 2, 1}}; tflite::TensorData output = {GetTTEnum(), {4, 2, 1}}; Atan2Model m(y, x, output); - std::vector y_data = {Float(0.1), Float(0.2), Float(0.3), Float(0.4), - Float(0.5), 
Float(0.6), Float(0.7), Float(0.8)}; - std::vector x_data = {Float(0.8), Float(0.7), Float(0.6), Float(0.5), - Float(0.4), Float(0.3), Float(0.2), Float(0.1)}; + std::vector y_data = {Float(0.1f), Float(0.2f), Float(0.3f), + Float(0.4f), Float(0.5f), Float(0.6f), + Float(0.7f), Float(0.8f)}; + std::vector x_data = {Float(0.8f), Float(0.7f), Float(0.6f), + Float(0.5f), Float(0.4f), Float(0.3f), + Float(0.2f), Float(0.1f)}; auto got = m.GetOutput(y_data, x_data); ASSERT_EQ(got.size(), 8); for (int i = 0; i < 8; ++i) { diff --git a/tensorflow/lite/kernels/cast.cc b/tensorflow/lite/kernels/cast.cc index 192a552bca4ea2..3560c21e5d498a 100644 --- a/tensorflow/lite/kernels/cast.cc +++ b/tensorflow/lite/kernels/cast.cc @@ -29,6 +29,8 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/types/fp16.h" +#include "tensorflow/lite/types/half.h" #ifdef __ARM_NEON #include @@ -99,17 +101,9 @@ void copyCast(const std::complex* in, std::complex* out, } template -void copyCast(const Eigen::half* in, ToT* out, int num_elements) { - std::transform(in, in + num_elements, out, [](Eigen::half a) { - return static_cast(Eigen::half_impl::half_to_float(a)); - }); -} - -template <> -void copyCast(const Eigen::half* in, std::complex* out, - int num_elements) { - std::transform(in, in + num_elements, out, [](Eigen::half a) { - return std::complex(Eigen::half_impl::half_to_float(a)); +void copyCast(const half* in, ToT* out, int num_elements) { + std::transform(in, in + num_elements, out, [](half a) { + return static_cast(fp16_ieee_to_fp32_value(a)); }); } @@ -122,33 +116,26 @@ void copyCast(const Eigen::bfloat16* in, std::complex* out, } template -void copyCastToFloat16(const FromT* in, Eigen::half* out, int num_elements) { +void copyCastToFloat16(const FromT* in, half* out, int num_elements) { std::transform(in, in + num_elements, out, [](FromT a) { - return Eigen::half_impl::float_to_half_rtne(static_cast(a)); + return half::from_bits(fp16_ieee_from_fp32_value(static_cast(a))); }); } template <> -void copyCastToFloat16(const std::complex* in, Eigen::half* out, +void copyCastToFloat16(const std::complex* in, half* out, int num_elements) { std::transform(in, in + num_elements, out, [](std::complex a) { - return Eigen::half_impl::float_to_half_rtne(std::real(a)); + return half::from_bits(fp16_ieee_from_fp32_value(std::real(a))); }); } template <> -void copyCastToFloat16(const Eigen::half* in, Eigen::half* out, - int num_elements) { - std::transform(in, in + num_elements, out, [](Eigen::half a) { return a; }); -} - -template <> -void copyCastToFloat16(const Eigen::bfloat16* in, Eigen::half* out, - int num_elements) { +void copyCastToFloat16(const Eigen::bfloat16* in, half* out, int num_elements) { // bfloat16 -> float -> half (fp16) std::transform(in, in + num_elements, out, [](Eigen::bfloat16 a) { - return Eigen::half_impl::float_to_half_rtne( - Eigen::bfloat16_impl::bfloat16_to_float(a)); + return half::from_bits( + fp16_ieee_from_fp32_value(Eigen::bfloat16_impl::bfloat16_to_float(a))); }); } @@ -310,7 +297,7 @@ TfLiteStatus copyToTensor(TfLiteContext* context, const FromT* in, copyCast(in, out->data.int8, num_elements); break; case kTfLiteFloat16: - copyCastToFloat16(in, reinterpret_cast(out->data.f16), + copyCastToFloat16(in, reinterpret_cast(out->data.f16), num_elements); break; case kTfLiteBFloat16: diff --git a/tensorflow/lite/kernels/cast_test.cc 
b/tensorflow/lite/kernels/cast_test.cc index 77cc2f3442b1c2..09cc8fbfbda37c 100644 --- a/tensorflow/lite/kernels/cast_test.cc +++ b/tensorflow/lite/kernels/cast_test.cc @@ -23,7 +23,6 @@ limitations under the License. #include #include "absl/random/random.h" #include "absl/types/span.h" -#include "Eigen/Core" // from @eigen_archive #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/c/c_api_types.h" #include "tensorflow/lite/kernels/cast_test_common.h" @@ -31,6 +30,7 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -413,11 +413,10 @@ TEST(CastOpModel, CastFloatToFloat16) { m.PopulateTensor(m.input(), {100.f, 1.0f, 0.f, 0.4f, 1.999f, 1.1f}); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT( - m.ExtractVector(m.output()), - ElementsAreArray( - {static_cast(100.f), static_cast(1.0f), - static_cast(0.f), static_cast(0.4f), - static_cast(1.999f), static_cast(1.1)})); + m.ExtractVector(m.output()), + ElementsAreArray({static_cast(100.f), static_cast(1.0f), + static_cast(0.f), static_cast(0.4f), + static_cast(1.999f), static_cast(1.1f)})); } TEST(CastOpModel, CastFloatToBFloat16) { @@ -435,11 +434,10 @@ TEST(CastOpModel, CastFloatToBFloat16) { TEST(CastOpModel, CastFloat16ToFloat) { CastOpModel m({TensorType_FLOAT16, {3, 2}}, {TensorType_FLOAT32, {3, 2}}); - m.PopulateTensor( - m.input(), - {static_cast(100.f), static_cast(1.0f), - static_cast(0.f), static_cast(0.4f), - static_cast(1.999f), static_cast(1.1f)}); + m.PopulateTensor(m.input(), + {static_cast(100.f), static_cast(1.0f), + static_cast(0.f), static_cast(0.4f), + static_cast(1.999f), static_cast(1.1f)}); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT(m.ExtractVector(m.output()), ElementsAreArray(ArrayFloatNear( @@ -462,6 +460,61 @@ TEST(CastOpModel, CastBFloat16ToFloat) { /*max_abs_err=*/0.05f))); } +TEST(CastOpModel, CastFloat16ToInt32) { + CastOpModel m({TensorType_FLOAT16, {1, 6}}, {TensorType_INT32, {1, 6}}); + m.PopulateTensor(m.input(), + {static_cast(100.f), static_cast(20.f), + static_cast(3.f), static_cast(0.4f), + static_cast(0.999f), static_cast(1.1f)}); + ASSERT_EQ(m.Invoke(), kTfLiteOk); + EXPECT_THAT(m.ExtractVector(m.output()), + ElementsAreArray({100, 20, 3, 0, 0, 1})); +} + +TEST(CastOpModel, CastInt32ToFloat16) { + CastOpModel m({TensorType_INT32, {1, 6}}, {TensorType_FLOAT16, {1, 6}}); + m.PopulateTensor(m.input(), {100, 20, 3, 0, 1, -1}); + ASSERT_EQ(m.Invoke(), kTfLiteOk); + EXPECT_THAT( + m.ExtractVector(m.output()), + ElementsAreArray({static_cast(100.f), static_cast(20.f), + static_cast(3.f), static_cast(0.f), + static_cast(1.f), static_cast(-1.f)})); +} + +TEST(CastOpModel, CastFloat16ToBFloat16) { + CastOpModel m({TensorType_FLOAT16, {1, 6}}, {TensorType_BFLOAT16, {1, 6}}); + m.PopulateTensor(m.input(), + {static_cast(100.f), static_cast(20.f), + static_cast(3.f), static_cast(0.4f), + static_cast(0.999f), static_cast(1.1f)}); + ASSERT_EQ(m.Invoke(), kTfLiteOk); + EXPECT_THAT(m.ExtractVector(m.output()), + ElementsAreArray({static_cast(100.f), + static_cast(20.f), + static_cast(3.f), + static_cast(0.4f), + static_cast(0.999f), + static_cast(1.1f)})); +} + +TEST(CastOpModel, CastBFloat16ToFloat16) { + CastOpModel m({TensorType_BFLOAT16, {1, 6}}, {TensorType_FLOAT16, {1, 6}}); + m.PopulateTensor( + m.input(), + {static_cast(100.f), static_cast(20.f), + static_cast(3.f), static_cast(0.4f), + 
static_cast(0.999f), + static_cast(1.1f)}); + ASSERT_EQ(m.Invoke(), kTfLiteOk); + EXPECT_THAT(m.ExtractVector(m.output()), + ElementsAreArray(ArrayFloatNear( + {static_cast(100.f), static_cast(20.f), + static_cast(3.f), static_cast(0.4f), + static_cast(0.999f), static_cast(1.1f)}, + /*max_abs_err=*/0.05f))); +} + TEST(CastOpModel, CastConstInputCachingWorks) { // This tests the implementation of a performance optimization. If that // optimization is changed, this test will likely break/need to be updated. diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc index 10226bb60a8ed8..bc2091aa823832 100644 --- a/tensorflow/lite/kernels/comparisons_test.cc +++ b/tensorflow/lite/kernels/comparisons_test.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/string_type.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -396,12 +397,10 @@ TEST(ComparisonsTest, LessFloat) { TEST(ComparisonsTest, LessFloat16) { ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT16, BuiltinOperator_LESS); - model.PopulateTensor( - model.input1(), - {Eigen::half(0.1), Eigen::half(0.9), Eigen::half(0.7), Eigen::half(0.3)}); - model.PopulateTensor( - model.input2(), - {Eigen::half(0.1), Eigen::half(0.2), Eigen::half(0.6), Eigen::half(0.5)}); + model.PopulateTensor(model.input1(), + {half(0.1f), half(0.9f), half(0.7f), half(0.3f)}); + model.PopulateTensor(model.input2(), + {half(0.1f), half(0.2f), half(0.6f), half(0.5f)}); ASSERT_EQ(model.Invoke(), kTfLiteOk); EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, true)); diff --git a/tensorflow/lite/kernels/concatenation_test.cc b/tensorflow/lite/kernels/concatenation_test.cc index 28692ae1528dd3..f9c765375cc20f 100644 --- a/tensorflow/lite/kernels/concatenation_test.cc +++ b/tensorflow/lite/kernels/concatenation_test.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -121,12 +122,11 @@ TEST(ConcatenationOpTest, ThreeDimensionalOneInputBFloat16) { } TEST(ConcatenationOpTest, ThreeDimensionalOneInputFloat16) { - ConcatenationOpModel m({TensorType_FLOAT16, {2, 1, 2}}, - /*axis=*/1, - /*num_inputs=*/1); - m.SetInput(0, - {static_cast(1.0f), static_cast(3.0f), - static_cast(4.0f), static_cast(7.0f)}); + ConcatenationOpModel m({TensorType_FLOAT16, {2, 1, 2}}, + /*axis=*/1, + /*num_inputs=*/1); + m.SetInput(0, {static_cast(1.0f), static_cast(3.0f), + static_cast(4.0f), static_cast(7.0f)}); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 4, 7})); } @@ -206,23 +206,21 @@ TEST(ConcatenationOpTest, FiveDimensionalTwoInputBFloat16) { } TEST(ConcatenationOpTest, FiveDimensionalTwoInputFloat16) { - ConcatenationOpModel m({TensorType_FLOAT16, {2, 1, 2, 1, 3}}, - /*axis=*/0, - /*num_inputs=*/2); - m.SetInput( - 0, {static_cast(1.0f), static_cast(2.0f), - static_cast(3.0f), static_cast(4.0f), - static_cast(5.0f), static_cast(6.0f), - static_cast(7.0f), Eigen::half{8.0f}, - static_cast(9.0f), static_cast(10.0f), - static_cast(11.0f), static_cast(12.0f)}); - m.SetInput( - 1, {static_cast(13.0f), static_cast(14.0f), - Eigen::half{15.0f}, static_cast(16.0f), - Eigen::half{17.0f}, static_cast(18.0f), - static_cast(19.0f), static_cast(20.0f), - static_cast(21.0f), static_cast(22.0f), - static_cast(23.0f), static_cast(24.0f)}); + ConcatenationOpModel m({TensorType_FLOAT16, {2, 1, 2, 1, 3}}, + /*axis=*/0, + /*num_inputs=*/2); + m.SetInput(0, {static_cast(1.0f), static_cast(2.0f), + static_cast(3.0f), static_cast(4.0f), + static_cast(5.0f), static_cast(6.0f), + static_cast(7.0f), half{8.0f}, static_cast(9.0f), + static_cast(10.0f), static_cast(11.0f), + static_cast(12.0f)}); + m.SetInput(1, + {static_cast(13.0f), static_cast(14.0f), half{15.0f}, + static_cast(16.0f), half{17.0f}, static_cast(18.0f), + static_cast(19.0f), static_cast(20.0f), + static_cast(21.0f), static_cast(22.0f), + static_cast(23.0f), static_cast(24.0f)}); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT( m.GetOutput(), diff --git a/tensorflow/lite/kernels/dynamic_update_slice_test.cc b/tensorflow/lite/kernels/dynamic_update_slice_test.cc index 373a719d5ac412..99aa637a068d23 100644 --- a/tensorflow/lite/kernels/dynamic_update_slice_test.cc +++ b/tensorflow/lite/kernels/dynamic_update_slice_test.cc @@ -28,6 +28,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/subgraph_test_util.h" #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -112,10 +113,9 @@ TEST(DynamicUpdateSliceOpTest, SimpleTestF16InPlaceInput) { DynamicUpdateSliceOpModel m({TensorType_FLOAT16, {3, 3}}, {TensorType_FLOAT16, {2, 1}}, {TensorType_INT32, {2}}); - m.SetInput({Eigen::half(1), Eigen::half(2), Eigen::half(3), - Eigen::half(4), Eigen::half(5), Eigen::half(6), - Eigen::half(7), Eigen::half(8), Eigen::half(9)}); - m.SetUpdate({Eigen::half(-1), Eigen::half(-2)}); + m.SetInput({half(1), half(2), half(3), half(4), half(5), half(6), + half(7), half(8), half(9)}); + m.SetUpdate({half(-1), half(-2)}); m.SetStartIndices({1, 1}); const int kInplaceInputTensorIdx = 0; const int kInplaceOutputTensorIdx = 0; @@ -123,11 +123,10 @@ TEST(DynamicUpdateSliceOpTest, SimpleTestF16InPlaceInput) { TfLiteTensor* output_tensor = m.GetOutputTensor(kInplaceOutputTensorIdx); output_tensor->data.data = input_tensor->data.data; ASSERT_EQ(m.Invoke(), kTfLiteOk); - EXPECT_THAT(m.GetOutput(), - ElementsAreArray(ArrayFloatNear( - {Eigen::half(1), Eigen::half(2), Eigen::half(3), - Eigen::half(4), Eigen::half(-1), Eigen::half(6), - Eigen::half(7), Eigen::half(-2), Eigen::half(9)}))); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray( + ArrayFloatNear({half(1), half(2), half(3), half(4), half(-1), + half(6), half(7), half(-2), half(9)}))); EXPECT_EQ(output_tensor->data.data, input_tensor->data.data); } diff --git a/tensorflow/lite/kernels/fill_test.cc b/tensorflow/lite/kernels/fill_test.cc index 028623e3a0a321..a8e9815f30bc61 100644 --- a/tensorflow/lite/kernels/fill_test.cc +++ b/tensorflow/lite/kernels/fill_test.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/string_type.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -139,8 +140,8 @@ TEST_P(FillOpTest, FillFloat) { } TEST_P(FillOpTest, FillFloat16) { - FillOpModel m(TensorType_INT64, {3}, {2, 2, 2}, - Eigen::half(4.0f), GetParam()); + FillOpModel m(TensorType_INT64, {3}, {2, 2, 2}, half(4.0f), + GetParam()); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT( m.GetOutput(), diff --git a/tensorflow/lite/kernels/floor_test.cc b/tensorflow/lite/kernels/floor_test.cc index 86ea68ad39e599..13154175e334cc 100644 --- a/tensorflow/lite/kernels/floor_test.cc +++ b/tensorflow/lite/kernels/floor_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -79,28 +80,28 @@ TEST(FloorOpTest, MultiDims) { TEST(FloorOpTest, SingleDimFloat16) { FloorOpModel model({2}, TensorType_FLOAT16); - model.PopulateTensor<>(model.input(), {Eigen::half(8.5), Eigen::half(0.0)}); + model.PopulateTensor<>(model.input(), {half(8.5f), half(0.0f)}); ASSERT_EQ(model.Invoke(), kTfLiteOk); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({8, 0})); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({8, 0})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2})); } TEST(FloorOpTest, MultiDimsFloat16) { FloorOpModel model({2, 1, 1, 5}, TensorType_FLOAT16); - model.PopulateTensor(model.input(), { - Eigen::half(0.75), - Eigen::half(8.25), - Eigen::half(0.49), - Eigen::half(9.99), - Eigen::half(0.5), - Eigen::half(-0.25), - Eigen::half(-8.75), - Eigen::half(-0.99), - Eigen::half(-9.49), - Eigen::half(-0.5), - }); + model.PopulateTensor(model.input(), { + half(0.75f), + half(8.25f), + half(0.49f), + half(9.99f), + half(0.5f), + half(-0.25f), + half(-8.75f), + half(-0.99f), + half(-9.49f), + half(-0.5f), + }); ASSERT_EQ(model.Invoke(), kTfLiteOk); - EXPECT_THAT(model.GetOutput(), + EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 8, 0, 9, 0, -1, -9, -1, -10, -1})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5})); } diff --git a/tensorflow/lite/kernels/gather_nd_test.cc b/tensorflow/lite/kernels/gather_nd_test.cc index 2bd9a0235ebe2c..f4b9f65711fbdc 100644 --- a/tensorflow/lite/kernels/gather_nd_test.cc +++ b/tensorflow/lite/kernels/gather_nd_test.cc @@ -20,10 +20,12 @@ limitations under the License. #include #include +#include "Eigen/Core" // from @eigen_archive #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/string_type.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -244,21 +246,19 @@ TEST(GatherNdOpTest, BFloat16Int32) { TEST(GatherNdOpTest, Float16Int32) { GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}}, {TensorType_INT32, {2, 2}}); - m.SetInput( - {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1), - Eigen::half(2.2), Eigen::half(2.3), // - Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1), - Eigen::half(-4.2), Eigen::half(4.3), // - Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1), - Eigen::half(-6.2), Eigen::half(6.3)}); + m.SetInput({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f), + half(2.2f), half(2.3f), // + half(3.1f), half(3.2f), half(-3.3f), half(-4.1f), + half(-4.2f), half(4.3f), // + half(5.1f), half(-5.2f), half(5.3f), half(6.1f), + half(-6.2f), half(6.3f)}); m.SetPositions({0, 1, 1, 0}); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT( - m.GetOutput(), - Pointwise(FloatingPointEq(), - {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3), - Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)})); + m.GetOutput(), + Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f), + half(3.1f), half(3.2f), half(-3.3f)})); } TEST(GatherNdOpTest, Float32Int32) { @@ -297,21 +297,19 @@ TEST(GatherNdOpTest, BFloat16Int64) { TEST(GatherNdOpTest, Float16Int64) { GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}}, {TensorType_INT64, {2, 2}}); - m.SetInput( - {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), 
Eigen::half(-2.1), - Eigen::half(2.2), Eigen::half(2.3), // - Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1), - Eigen::half(-4.2), Eigen::half(4.3), // - Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1), - Eigen::half(-6.2), Eigen::half(6.3)}); + m.SetInput({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f), + half(2.2f), half(2.3f), // + half(3.1f), half(3.2f), half(-3.3f), half(-4.1f), + half(-4.2f), half(4.3f), // + half(5.1f), half(-5.2f), half(5.3f), half(6.1f), + half(-6.2f), half(6.3f)}); m.SetPositions({0LL, 1LL, 1LL, 0LL}); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT( - m.GetOutput(), - Pointwise(FloatingPointEq(), - {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3), - Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)})); + m.GetOutput(), + Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f), + half(3.1f), half(3.2f), half(-3.3f)})); } TEST(GatherNdOpTest, Float32Int64) { @@ -462,21 +460,19 @@ TEST(GatherNdOpTest, BFloat16Int16) { TEST(GatherNdOpTest, Float16Int16) { GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}}, {TensorType_INT16, {2, 2}}); - m.SetInput( - {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1), - Eigen::half(2.2), Eigen::half(2.3), // - Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1), - Eigen::half(-4.2), Eigen::half(4.3), // - Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1), - Eigen::half(-6.2), Eigen::half(6.3)}); + m.SetInput({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f), + half(2.2f), half(2.3f), // + half(3.1f), half(3.2f), half(-3.3f), half(-4.1f), + half(-4.2f), half(4.3f), // + half(5.1f), half(-5.2f), half(5.3f), half(6.1f), + half(-6.2f), half(6.3f)}); m.SetPositions({0, 1, 1, 0}); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT( - m.GetOutput(), - Pointwise(FloatingPointEq(), - {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3), - Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)})); + m.GetOutput(), + Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f), + half(3.1f), half(3.2f), half(-3.3f)})); } TEST(GatherNdOpTest, Float32Int16) { diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc index 23e30eb7867774..61ca1b654f6160 100644 --- a/tensorflow/lite/kernels/gather_test.cc +++ b/tensorflow/lite/kernels/gather_test.cc @@ -20,9 +20,11 @@ limitations under the License. #include #include +#include "Eigen/Core" // from @eigen_archive #include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -252,7 +254,7 @@ TEST_P(GatherOpTest, LastAxis0DIndex) { } using TestTypes = testing::Types; + float, half, Eigen::bfloat16>; template struct TypedGatherOpTest : public testing::Test {}; diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc index babdb4f69fad03..00e25ee9b86500 100644 --- a/tensorflow/lite/kernels/maximum_minimum_test.cc +++ b/tensorflow/lite/kernels/maximum_minimum_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -247,24 +248,20 @@ TEST(MaximumOpTest, Int32WithBroadcastTest5D) { } TEST(MaximumOpTest, Float16Test) { - std::initializer_list data1 = { - Eigen::half(1.0), Eigen::half(0.0), Eigen::half(-1.0), - Eigen::half(11.0), Eigen::half(-2.0), Eigen::half(-1.44)}; - std::initializer_list data2 = { - Eigen::half(-1.0), Eigen::half(0.0), Eigen::half(1.0), - Eigen::half(12.0), Eigen::half(-3.0), Eigen::half(-1.43)}; - TestModel( - BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2}}, - {TensorType_FLOAT16, {3, 1, 2}}, {TensorType_FLOAT16, {3, 1, 2}}, data1, - data2, - {Eigen::half(1.0), Eigen::half(0.0), Eigen::half(1.0), Eigen::half(12.0), - Eigen::half(-2.0), Eigen::half(-1.43)}); - TestModel( - BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2}}, - {TensorType_FLOAT16, {3, 1, 2}}, {TensorType_FLOAT16, {3, 1, 2}}, data1, - data2, - {Eigen::half(-1.0), Eigen::half(0.0), Eigen::half(-1.0), - Eigen::half(11.0), Eigen::half(-3.0), Eigen::half(-1.44)}); + std::initializer_list data1 = {half(1.0f), half(0.0f), half(-1.0f), + half(11.0f), half(-2.0f), half(-1.44f)}; + std::initializer_list data2 = {half(-1.0f), half(0.0f), half(1.0f), + half(12.0f), half(-3.0f), half(-1.43f)}; + TestModel(BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2}}, + {TensorType_FLOAT16, {3, 1, 2}}, + {TensorType_FLOAT16, {3, 1, 2}}, data1, data2, + {half(1.0f), half(0.0f), half(1.0f), half(12.0f), half(-2.0f), + half(-1.43f)}); + TestModel(BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2}}, + {TensorType_FLOAT16, {3, 1, 2}}, + {TensorType_FLOAT16, {3, 1, 2}}, data1, data2, + {half(-1.0f), half(0.0f), half(-1.0f), half(11.0f), + half(-3.0f), half(-1.44f)}); } TEST(MaximumOpTest, BFloat16Test) { @@ -308,42 +305,39 @@ TEST(MaximumOpTest, BFloat16WithBroadcastTest5DScalarY) { } TEST(MaximumOpTest, Float16WithBroadcastTest5DScalarY) { - std::initializer_list data1 = { - Eigen::half(1.0), Eigen::half(0.0), Eigen::half(-1.0), - Eigen::half(-2.0), Eigen::half(3.0), Eigen::half(11.0)}; - std::initializer_list data2 = {Eigen::half(2.0)}; - TestModel( - BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, - {TensorType_FLOAT16, {1}}, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, - data2, - {Eigen::half(2.0), Eigen::half(2.0), Eigen::half(2.0), Eigen::half(2.0), - Eigen::half(3.0), Eigen::half(11.0)}); - TestModel( - BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, - {TensorType_FLOAT16, {1}}, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, - data2, - {Eigen::half(1.0), Eigen::half(0.0), Eigen::half(-1.0), Eigen::half(-2.0), - Eigen::half(2.0), Eigen::half(2.0)}); + std::initializer_list data1 = {half(1.0f), half(0.0f), half(-1.0f), + half(-2.0f), half(3.0f), half(11.0f)}; + std::initializer_list data2 = {half(2.0f)}; + TestModel(BuiltinOperator_MAXIMUM, + {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, + {TensorType_FLOAT16, {1}}, + {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, data2, + {half(2.0f), half(2.0f), half(2.0f), half(2.0f), half(3.0f), + half(11.0f)}); + TestModel(BuiltinOperator_MINIMUM, + {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, + {TensorType_FLOAT16, {1}}, + {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, data2, + {half(1.0f), half(0.0f), half(-1.0f), half(-2.0f), half(2.0f), + half(2.0f)}); } TEST(MaximumOpTest, Float16WithBroadcastTest5D) { - std::initializer_list data1 = { - Eigen::half(1.0), Eigen::half(0.0), 
Eigen::half(-1.0), - Eigen::half(-2.0), Eigen::half(-1.44), Eigen::half(11.0)}; - std::initializer_list data2 = {Eigen::half(0.5), - Eigen::half(2.0)}; - TestModel( - BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, - {TensorType_FLOAT16, {2}}, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, - data2, - {Eigen::half(1.0), Eigen::half(2.0), Eigen::half(0.5), Eigen::half(2.0), - Eigen::half(0.5), Eigen::half(11.0)}); - TestModel( - BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, - {TensorType_FLOAT16, {2}}, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, - data2, - {Eigen::half(0.5), Eigen::half(0.0), Eigen::half(-1.0), Eigen::half(-2.0), - Eigen::half(-1.44), Eigen::half(2.0)}); + std::initializer_list data1 = {half(1.0f), half(0.0f), half(-1.0f), + half(-2.0f), half(-1.44f), half(11.0f)}; + std::initializer_list data2 = {half(0.5f), half(2.0f)}; + TestModel(BuiltinOperator_MAXIMUM, + {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, + {TensorType_FLOAT16, {2}}, + {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, data2, + {half(1.0f), half(2.0f), half(0.5f), half(2.0f), half(0.5f), + half(11.0f)}); + TestModel(BuiltinOperator_MINIMUM, + {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, + {TensorType_FLOAT16, {2}}, + {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, data2, + {half(0.5f), half(0.0f), half(-1.0f), half(-2.0f), + half(-1.44f), half(2.0f)}); } TEST(MaximumOpTest, BFloat16WithBroadcastTest5D) { diff --git a/tensorflow/lite/kernels/neg_test.cc b/tensorflow/lite/kernels/neg_test.cc index fe9cc68bdf8a4d..883f9182758412 100644 --- a/tensorflow/lite/kernels/neg_test.cc +++ b/tensorflow/lite/kernels/neg_test.cc @@ -22,6 +22,7 @@ limitations under the License. #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -67,14 +68,12 @@ TEST(NegOpModel, NegFloat32) { TEST(NegOpModel, NegFloat16) { NegOpModel m({TensorType_FLOAT16, {6}}, {TensorType_FLOAT16, {6}}); - m.SetInput({Eigen::half(-2.0f), Eigen::half(-1.0f), - Eigen::half(0.f), Eigen::half(1.0f), - Eigen::half(2.0f), Eigen::half(3.0f)}); + m.SetInput({half(-2.0f), half(-1.0f), half(0.f), half(1.0f), half(2.0f), + half(3.0f)}); ASSERT_EQ(m.Invoke(), kTfLiteOk); - EXPECT_THAT(m.GetOutput(), - ElementsAreArray({Eigen::half(2.0f), Eigen::half(1.0f), - Eigen::half(0.f), Eigen::half(-1.0f), - Eigen::half(-2.0f), Eigen::half(-3.0f)})); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray({half(2.0f), half(1.0f), half(0.f), half(-1.0f), + half(-2.0f), half(-3.0f)})); } TEST(NegOpModel, NegBfloat16) { diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc index 971be96a915b4b..b985abccddcee7 100644 --- a/tensorflow/lite/kernels/pad_test.cc +++ b/tensorflow/lite/kernels/pad_test.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/lite/core/interpreter.h" #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -927,19 +928,16 @@ TEST_F(PadV2OpTest, Int16PaddingSimpleConstFloat32ValuedTestInt8) { template void SimpleConstFloat16ValuedTest() { - PadV2OpConstModel m( + PadV2OpConstModel m( {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, - Eigen::half{4.0f}, {TensorType_FLOAT16}); - m.SetInput({Eigen::half{1.5f}, Eigen::half{2.5f}, Eigen::half{3.5f}, - Eigen::half{4.5}}); + half{4.0f}, {TensorType_FLOAT16}); + m.SetInput({half{1.5f}, half{2.5f}, half{3.5f}, half{4.5f}}); ASSERT_EQ(m.Invoke(), kTfLiteOk); - EXPECT_THAT( - m.GetOutput(), - ElementsAreArray(ArrayFloatNear( - {Eigen::half{4}, Eigen::half{4}, Eigen::half{4}, Eigen::half{4}, - Eigen::half{4}, Eigen::half{1.5}, Eigen::half{2.5}, Eigen::half{4}, - Eigen::half{4}, Eigen::half{3.5}, Eigen::half{4.5}, Eigen::half{4}, - Eigen::half{4}, Eigen::half{4}, Eigen::half{4}, Eigen::half{4}}))); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + {half{4}, half{4}, half{4}, half{4}, half{4}, half{1.5f}, + half{2.5f}, half{4}, half{4}, half{3.5f}, half{4.5f}, + half{4}, half{4}, half{4}, half{4}, half{4}}))); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1})); } @@ -1050,12 +1048,15 @@ TEST_F(PadV2OpTest, Int16PaddingSimple4DConstFloat32ValuedTest) { template void Simple4DConstFloat16ValuedTest() { - PadV2OpConstModel m( + PadV2OpConstModel m( {TensorType_FLOAT16, {1, 1, 2, 1}}, {4, 2}, {0, 1, 0, 0, 0, 0, 0, 1}, - Eigen::half{7.0}, {TensorType_FLOAT16}); - m.SetInput({Eigen::half{3.0f}, Eigen::half{6.0f}}); + half{7.0f}, {TensorType_FLOAT16}); + m.SetInput({half{3.0f}, half{6.0f}}); ASSERT_EQ(m.Invoke(), kTfLiteOk); - EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 7, 6, 7, 7, 7, 7, 7})); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + {half{3.0f}, half{7.0f}, half{6.0f}, half{7.0f}, half{7.0f}, + half{7.0f}, half{7.0f}, half{7.0f}}))); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 2, 2})); } @@ -1167,15 +1168,18 @@ TEST_F(PadV2OpTest, Int16PaddingSimpleDynamicTest) { template void SimpleDynamicTestV2Float16() { - PadV2OpDynamicModel m( - {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, Eigen::half{0.0}, + PadV2OpDynamicModel m( + {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, half{0.0f}, {TensorType_FLOAT16}); - m.SetInput({Eigen::half{1.0f}, Eigen::half{2.0f}, Eigen::half{3.0f}, - Eigen::half{4.0f}}); + m.SetInput({half{1.0f}, half{2.0f}, half{3.0f}, half{4.0f}}); m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0}); ASSERT_EQ(m.Invoke(), kTfLiteOk); - EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, - 0, 0, 0, 0, 0})); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + {half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f}, + half{1.0f}, half{2.0f}, half{0.0f}, half{0.0f}, half{3.0f}, + half{4.0f}, half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f}, + half{0.0f}}))); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1})); } diff --git a/tensorflow/lite/kernels/reverse_test.cc b/tensorflow/lite/kernels/reverse_test.cc index 4301b0120f53c3..7e2d3df543ba28 100644 --- a/tensorflow/lite/kernels/reverse_test.cc +++ b/tensorflow/lite/kernels/reverse_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -354,45 +355,38 @@ TEST(ReverseOpTest, Int16MultiDimensions) { // float16 tests. TEST(ReverseOpTest, Float16OneDimension) { - ReverseOpModel model({TensorType_FLOAT16, {4}}, - {TensorType_INT32, {1}}); - model.PopulateTensor( - model.input(), - {Eigen::half(1), Eigen::half(2), Eigen::half(3), Eigen::half(4)}); + ReverseOpModel model({TensorType_FLOAT16, {4}}, + {TensorType_INT32, {1}}); + model.PopulateTensor(model.input(), + {half(1), half(2), half(3), half(4)}); model.PopulateTensor(model.axis(), {0}); ASSERT_EQ(model.Invoke(), kTfLiteOk); EXPECT_THAT(model.GetOutputShape(), ElementsAre(4)); EXPECT_THAT(model.GetOutput(), - ElementsAreArray({Eigen::half(4), Eigen::half(3), Eigen::half(2), - Eigen::half(1)})); + ElementsAreArray({half(4), half(3), half(2), half(1)})); } TEST(ReverseOpTest, Float16MultiDimensions) { - ReverseOpModel model({TensorType_FLOAT16, {4, 3, 2}}, - {TensorType_INT32, {1}}); - model.PopulateTensor( + ReverseOpModel model({TensorType_FLOAT16, {4, 3, 2}}, + {TensorType_INT32, {1}}); + model.PopulateTensor( model.input(), - {Eigen::half(1), Eigen::half(2), Eigen::half(3), Eigen::half(4), - Eigen::half(5), Eigen::half(6), Eigen::half(7), Eigen::half(8), - Eigen::half(9), Eigen::half(10), Eigen::half(11), Eigen::half(12), - Eigen::half(13), Eigen::half(14), Eigen::half(15), Eigen::half(16), - Eigen::half(17), Eigen::half(18), Eigen::half(19), Eigen::half(20), - Eigen::half(21), Eigen::half(22), Eigen::half(23), Eigen::half(24)}); + {half(1), half(2), half(3), half(4), half(5), half(6), + half(7), half(8), half(9), half(10), half(11), half(12), + half(13), half(14), half(15), half(16), half(17), half(18), + half(19), half(20), half(21), half(22), half(23), half(24)}); model.PopulateTensor(model.axis(), {1}); ASSERT_EQ(model.Invoke(), kTfLiteOk); EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2)); EXPECT_THAT( model.GetOutput(), - ElementsAreArray({Eigen::half(5), Eigen::half(6), Eigen::half(3), - Eigen::half(4), Eigen::half(1), Eigen::half(2), - Eigen::half(11), Eigen::half(12), Eigen::half(9), - Eigen::half(10), Eigen::half(7), Eigen::half(8), - Eigen::half(17), Eigen::half(18), Eigen::half(15), - Eigen::half(16), Eigen::half(13), Eigen::half(14), - Eigen::half(23), Eigen::half(24), Eigen::half(21), - Eigen::half(22), Eigen::half(19), Eigen::half(20)})); + ElementsAreArray({half(5), half(6), half(3), half(4), half(1), + half(2), half(11), half(12), half(9), half(10), + half(7), half(8), half(17), half(18), half(15), + half(16), half(13), half(14), half(23), half(24), + half(21), half(22), half(19), half(20)})); } // bfloat16 tests. diff --git a/tensorflow/lite/kernels/round_test.cc b/tensorflow/lite/kernels/round_test.cc index c3752827f3e61c..e3fccf888c9815 100644 --- a/tensorflow/lite/kernels/round_test.cc +++ b/tensorflow/lite/kernels/round_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -68,33 +69,29 @@ TEST(RoundOpTest, MultiDims) { } TEST(RoundOpTest, Float16SingleDim) { - RoundOpModel model({6}); - model.PopulateTensor( - model.input(), {Eigen::half(8.5), Eigen::half(0.0), Eigen::half(3.5), - Eigen::half(4.2), Eigen::half(-3.5), Eigen::half(-4.5)}); + RoundOpModel model({6}); + model.PopulateTensor(model.input(), + {half(8.5f), half(0.0f), half(3.5f), half(4.2f), + half(-3.5f), half(-4.5f)}); ASSERT_EQ(model.Invoke(), kTfLiteOk); - EXPECT_THAT( - model.GetOutput(), - ElementsAreArray({Eigen::half(8), Eigen::half(0), Eigen::half(4), - Eigen::half(4), Eigen::half(-4), Eigen::half(-4)})); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray( + {half(8), half(0), half(4), half(4), half(-4), half(-4)})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({6})); } TEST(RoundOpTest, Float16MultiDims) { - RoundOpModel model({2, 1, 1, 6}); - model.PopulateTensor( + RoundOpModel model({2, 1, 1, 6}); + model.PopulateTensor( model.input(), - {Eigen::half(0.0001), Eigen::half(8.0001), Eigen::half(0.9999), - Eigen::half(9.9999), Eigen::half(0.5), Eigen::half(-0.0001), - Eigen::half(-8.0001), Eigen::half(-0.9999), Eigen::half(-9.9999), - Eigen::half(-0.5), Eigen::half(-2.5), Eigen::half(1.5)}); + {half(0.0001f), half(8.0001f), half(0.9999f), half(9.9999f), half(0.5f), + half(-0.0001f), half(-8.0001f), half(-0.9999f), half(-9.9999f), + half(-0.5f), half(-2.5f), half(1.5f)}); ASSERT_EQ(model.Invoke(), kTfLiteOk); - EXPECT_THAT( - model.GetOutput(), - ElementsAreArray({Eigen::half(0), Eigen::half(8), Eigen::half(1), - Eigen::half(10), Eigen::half(0), Eigen::half(0), - Eigen::half(-8), Eigen::half(-1), Eigen::half(-10), - Eigen::half(-0), Eigen::half(-2), Eigen::half(2)})); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({half(0), half(8), half(1), half(10), half(0), + half(0), half(-8), half(-1), half(-10), + half(-0), half(-2), half(2)})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 6})); } diff --git a/tensorflow/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc index feb02c48d2f3aa..2f3430770f7b68 100644 --- a/tensorflow/lite/kernels/slice_test.cc +++ b/tensorflow/lite/kernels/slice_test.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include -#include "Eigen/Core" #include #include #include "tensorflow/lite/core/c/common.h" @@ -29,6 +28,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/string_type.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -338,20 +338,16 @@ TEST_P(SliceOpTest, SliceBool) { } TEST_P(SliceOpTest, SliceFloat16) { - SliceOpModel m({3, 2, 3, 1}, {4}, {1, 0, 0, 0}, {4}, - {2, 1, -1, 1}, TensorType_INT32, - TensorType_FLOAT16, GetParam()); - m.SetInput({Eigen::half(1), Eigen::half(1), Eigen::half(1), Eigen::half(2), - Eigen::half(2), Eigen::half(2), Eigen::half(3), Eigen::half(3), - Eigen::half(3), Eigen::half(4), Eigen::half(4), Eigen::half(4), - Eigen::half(5), Eigen::half(5), Eigen::half(5), Eigen::half(6), - Eigen::half(6), Eigen::half(6)}); + SliceOpModel m({3, 2, 3, 1}, {4}, {1, 0, 0, 0}, {4}, + {2, 1, -1, 1}, TensorType_INT32, + TensorType_FLOAT16, GetParam()); + m.SetInput({half(1), half(1), half(1), half(2), half(2), half(2), half(3), + half(3), half(3), half(4), half(4), half(4), half(5), half(5), + half(5), half(6), half(6), half(6)}); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1})); - EXPECT_THAT( - m.GetOutput(), - ElementsAreArray({Eigen::half(3), Eigen::half(3), Eigen::half(3), - Eigen::half(5), Eigen::half(5), Eigen::half(5)})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({half(3), half(3), half(3), + half(5), half(5), half(5)})); } TEST_P(SliceOpTest, SliceBFloat16) { @@ -373,19 +369,16 @@ TEST_P(SliceOpTest, SliceBFloat16) { } TEST_P(SliceOpTest, BeginNonZeroSizeMinus1Axis1Float16) { - SliceOpModel m({3, 3, 2, 1}, {4}, {1, 1, 0, 0}, {4}, - {2, -1, 1, 1}, TensorType_INT32, - TensorType_FLOAT16, GetParam()); - m.SetInput({Eigen::half(1), Eigen::half(1), Eigen::half(2), Eigen::half(2), - Eigen::half(3), Eigen::half(3), Eigen::half(4), Eigen::half(4), - Eigen::half(5), Eigen::half(5), Eigen::half(6), Eigen::half(6), - Eigen::half(7), Eigen::half(7), Eigen::half(8), Eigen::half(8), - Eigen::half(9), Eigen::half(9)}); + SliceOpModel m({3, 3, 2, 1}, {4}, {1, 1, 0, 0}, {4}, + {2, -1, 1, 1}, TensorType_INT32, + TensorType_FLOAT16, GetParam()); + m.SetInput({half(1), half(1), half(2), half(2), half(3), half(3), half(4), + half(4), half(5), half(5), half(6), half(6), half(7), half(7), + half(8), half(8), half(9), half(9)}); ASSERT_EQ(m.Invoke(), kTfLiteOk); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 1, 1})); EXPECT_THAT(m.GetOutput(), - ElementsAreArray({Eigen::half(5), Eigen::half(6), Eigen::half(8), - Eigen::half(9)})); + ElementsAreArray({half(5), half(6), half(8), half(9)})); } TEST_P(SliceOpTest, BeginNonZeroSizeMinus1Axis1BFloat16) { diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc index 6ba4ef3b78977f..f7c79680576fe1 100644 --- a/tensorflow/lite/kernels/strided_slice_test.cc +++ b/tensorflow/lite/kernels/strided_slice_test.cc @@ -22,8 +22,10 @@ limitations under the License. 
#include #include +#include "Eigen/Core" // from @eigen_archive // IWYU pragma: keep #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace { @@ -152,7 +154,7 @@ class StridedSliceOpModel : public SingleOpModel { template class StridedSliceOpTest : public ::testing::Test {}; -using DataTypes = ::testing::Types; TYPED_TEST_SUITE(StridedSliceOpTest, DataTypes); @@ -347,7 +349,9 @@ TYPED_TEST(StridedSliceOpTest, In1D_Int32End) { continue; } std::vector values(32768); - std::iota(values.begin(), values.end(), TypeParam(0)); + for (int i = 0; i < 32768; ++i) { + values[i] = static_cast(i); + } StridedSliceOpModel m({32768}, {1}, {1}, {1}, values, {0}, {32768}, {1}, 0, 0, 0, 0, 0, diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc index 2ebeb4a9457280..f792bd31529582 100644 --- a/tensorflow/lite/kernels/test_util.cc +++ b/tensorflow/lite/kernels/test_util.cc @@ -59,6 +59,7 @@ limitations under the License. #include "tensorflow/lite/tools/logging.h" #include "tensorflow/lite/tools/serialization/writer_lib.h" #include "tensorflow/lite/tools/versioning/op_version.h" +#include "tensorflow/lite/types/fp16.h" // IWYU pragma: keep #include "tensorflow/lite/version.h" #include "tsl/platform/logging.h" diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h index cbdb74d29d04aa..69053a598785a2 100644 --- a/tensorflow/lite/kernels/test_util.h +++ b/tensorflow/lite/kernels/test_util.h @@ -38,7 +38,6 @@ limitations under the License. #include #include -#include "fp16/fp16.h" // from @FP16 #include "absl/algorithm/container.h" #include "absl/log/absl_check.h" #include "absl/log/absl_log.h" @@ -57,6 +56,8 @@ limitations under the License. #include "tensorflow/lite/string_util.h" #include "tensorflow/lite/testing/util.h" // IWYU pragma: keep #include "tensorflow/lite/tools/optimize/quantization_utils.h" +#include "tensorflow/lite/types/fp16.h" +#include "tensorflow/lite/types/half.h" #include "tensorflow/lite/util.h" #include "tsl/platform/logging.h" @@ -134,7 +135,7 @@ inline std::vector Dequantize(const std::vector& data, float scale, } template <> -constexpr TfLiteType typeToTfLiteType() { +constexpr TfLiteType typeToTfLiteType() { return kTfLiteFloat16; } @@ -1362,7 +1363,7 @@ TFLITE_TENSOR_TYPE_ASSOC(uint16_t, TensorType_UINT16); TFLITE_TENSOR_TYPE_ASSOC(uint32_t, TensorType_UINT32); TFLITE_TENSOR_TYPE_ASSOC(uint64_t, TensorType_UINT64); TFLITE_TENSOR_TYPE_ASSOC(TfLiteFloat16, TensorType_FLOAT16); -TFLITE_TENSOR_TYPE_ASSOC(Eigen::half, TensorType_FLOAT16); +TFLITE_TENSOR_TYPE_ASSOC(half, TensorType_FLOAT16); TFLITE_TENSOR_TYPE_ASSOC(TfLiteBFloat16, TensorType_BFLOAT16); TFLITE_TENSOR_TYPE_ASSOC(Eigen::bfloat16, TensorType_BFLOAT16); TFLITE_TENSOR_TYPE_ASSOC(float, TensorType_FLOAT32); @@ -1461,13 +1462,13 @@ struct TypeUnion { }; template <> -struct TypeUnion { +struct TypeUnion { public: // NOLINTNEXTLINE static constexpr TensorType tensor_type = TensorType::TensorType_FLOAT16; // NOLINTNEXTLINE static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteFloat16; - typedef Eigen::half ScalarType; + typedef half ScalarType; }; template <> diff --git a/tensorflow/lite/kernels/test_util_test.cc b/tensorflow/lite/kernels/test_util_test.cc index ed9a679b4e4d33..01f514692b0616 100644 --- a/tensorflow/lite/kernels/test_util_test.cc +++ b/tensorflow/lite/kernels/test_util_test.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "tensorflow/lite/array.h" #include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/kernels/test_delegate_providers.h" +#include "tensorflow/lite/types/half.h" #include "tensorflow/lite/util.h" namespace tflite { @@ -197,6 +198,14 @@ TEST(TestUtilTest, QuantizeVectorScalingUp) { EXPECT_THAT(q_data, ElementsAreArray(expected)); } +TEST(TestUtilTest, DequantizeVectorFp16) { + std::vector data = {half(-1.0f), half(-0.5f), half(0.0f), half(0.5f), + half(1.0f)}; + auto f_data = Dequantize(data, /*scale=*/0.1f, /*zero_point=*/0); + std::vector expected = {-0.1f, -0.05f, 0.0f, 0.05f, 0.1f}; + EXPECT_THAT(f_data, ElementsAreArray(tflite::ArrayFloatNear(expected, 1e-7))); +} + TEST(DimsAreMatcherTestTensor, ValidOneD) { TensorUniquePtr t = BuildTfLiteTensor(kTfLiteInt32, {2}, kTfLiteDynamic); EXPECT_THAT(t.get(), DimsAre({2})); diff --git a/tensorflow/lite/profiling/proto/CMakeLists.txt b/tensorflow/lite/profiling/proto/CMakeLists.txt index 0bfa81a41476f3..5738c992fc2839 100644 --- a/tensorflow/lite/profiling/proto/CMakeLists.txt +++ b/tensorflow/lite/profiling/proto/CMakeLists.txt @@ -17,8 +17,8 @@ find_package(Protobuf REQUIRED) add_library(profiling_info_proto profiling_info.proto) list(APPEND profiling_info_generated_files - ${CMAKE_BINARY_DIR}/tflite/profiling/proto/profiling_info.pb.cc - ${CMAKE_BINARY_DIR}/tflite/profiling/proto/profiling_info.pb.h) + ${CMAKE_BINARY_DIR}/tensorflow/lite/profiling/proto/profiling_info.pb.cc + ${CMAKE_BINARY_DIR}/tensorflow/lite/profiling/proto/profiling_info.pb.h) # Generate profiling_info.pb.cc and profiling_info.pb.h from # profiling_info.proto using protoc. Once the protobuf package version is @@ -26,7 +26,7 @@ list(APPEND profiling_info_generated_files add_custom_command( OUTPUT ${profiling_info_generated_files} COMMAND ${Protobuf_PROTOC_EXECUTABLE} - ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${CMAKE_CURRENT_SOURCE_DIR}/../../.. tflite/profiling/proto/profiling_info.proto + ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${TENSORFLOW_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/profiling_info.proto DEPENDS ${Protobuf_PROTOC_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/profiling_info.proto ) @@ -37,8 +37,8 @@ target_include_directories(profiling_info_proto PUBLIC ${CMAKE_BINARY_DIR}) add_library(model_runtime_info_proto model_runtime_info.proto) list(APPEND model_runtime_info_generated_files - ${CMAKE_BINARY_DIR}/tflite/profiling/proto/model_runtime_info.pb.cc - ${CMAKE_BINARY_DIR}/tflite/profiling/proto/model_runtime_info.pb.h + ${CMAKE_BINARY_DIR}/tensorflow/lite/profiling/proto/model_runtime_info.pb.cc + ${CMAKE_BINARY_DIR}/tensorflow/lite/profiling/proto/model_runtime_info.pb.h ) # Generate model_runtime_info.pb.cc and model_runtime_info.pb.h from @@ -47,7 +47,7 @@ list(APPEND model_runtime_info_generated_files add_custom_command( OUTPUT ${model_runtime_info_generated_files} COMMAND ${Protobuf_PROTOC_EXECUTABLE} - ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${CMAKE_CURRENT_SOURCE_DIR}/../../.. 
tflite/profiling/proto/model_runtime_info.proto + ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${TENSORFLOW_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/model_runtime_info.proto DEPENDS ${Protobuf_PROTOC_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/model_runtime_info.proto ${profiling_info_generated_files} ) diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index 3880f6461ed74b..3babca6c653022 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -12,6 +12,7 @@ package( "//tensorflow:__subpackages__", "//tensorflow:internal", "//third_party/odml/infra/genai/conversion:__subpackages__", + "//third_party/odml/litert/litert/python:__subpackages__", "//third_party/odml/model_customization/quantization:__subpackages__", "//third_party/py/ai_edge_torch:__subpackages__", "//third_party/py/tensorflow_federated:__subpackages__", diff --git a/tensorflow/lite/python/testdata/double_op.cc b/tensorflow/lite/python/testdata/double_op.cc index a6f8c542cd3b19..9d227a1e83e8ea 100644 --- a/tensorflow/lite/python/testdata/double_op.cc +++ b/tensorflow/lite/python/testdata/double_op.cc @@ -52,8 +52,8 @@ class DoubleOp : public OpKernel { }; REGISTER_KERNEL_BUILDER( - Name("Double").Device(DEVICE_CPU).TypeConstraint("T"), - DoubleOp); + Name("Double").Device(DEVICE_CPU).TypeConstraint("T"), + DoubleOp); REGISTER_KERNEL_BUILDER( Name("Double").Device(DEVICE_CPU).TypeConstraint("T"), DoubleOp); diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD index 958928db4663d8..ba3ce1c9e0b9e8 100644 --- a/tensorflow/lite/testing/BUILD +++ b/tensorflow/lite/testing/BUILD @@ -275,6 +275,7 @@ cc_library( hdrs = ["split.h"], deps = [ "//tensorflow/lite:string", + "//tensorflow/lite/types:half", "@eigen_archive//:eigen3", ], ) @@ -286,7 +287,9 @@ cc_test( deps = [ ":split", "//tensorflow/lite:string", + "//tensorflow/lite/types:half", "@com_google_googletest//:gtest_main", + "@eigen_archive//:eigen3", ], ) @@ -333,6 +336,7 @@ cc_library( "//tensorflow/lite/tools:logging", "//tensorflow/lite/tools/delegates:delegate_provider_hdr", "//tensorflow/lite/tools/evaluation:utils", + "//tensorflow/lite/types:half", "@com_google_absl//absl/strings", "@eigen_archive//:eigen3", ] + select({ diff --git a/tensorflow/lite/testing/split.h b/tensorflow/lite/testing/split.h index ec932a8de8d68f..5431bccf1a72f6 100644 --- a/tensorflow/lite/testing/split.h +++ b/tensorflow/lite/testing/split.h @@ -25,6 +25,7 @@ limitations under the License. #include "Eigen/Core" // from @eigen_archive #include "tensorflow/lite/string_type.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace testing { @@ -199,12 +200,10 @@ inline std::vector> Split(const string& s, } template <> -inline std::vector Split(const string& s, - const string& delimiter) { - std::vector fields; +inline std::vector Split(const string& s, const string& delimiter) { + std::vector fields; for (const auto& p : SplitToPos(s, delimiter)) { - fields.push_back(Eigen::half_impl::float_to_half_rtne( - strtof(s.data() + p.first, nullptr))); + fields.push_back(static_cast(strtof(s.data() + p.first, nullptr))); } return fields; } diff --git a/tensorflow/lite/testing/split_test.cc b/tensorflow/lite/testing/split_test.cc index c8824395ea97dc..90b8276b3ed654 100644 --- a/tensorflow/lite/testing/split_test.cc +++ b/tensorflow/lite/testing/split_test.cc @@ -16,7 +16,9 @@ limitations under the License. 
#include #include +#include "Eigen/Core" // from @eigen_archive #include "tensorflow/lite/string_type.h" +#include "tensorflow/lite/types/half.h" namespace tflite { namespace testing { @@ -45,6 +47,17 @@ TEST(SplitTest, SplitFloat) { EXPECT_THAT(Split("1.0 B 1e-5", " "), ElementsAre(1.0, 0.0, 1e-5)); } +TEST(SplitTest, SplitHalf) { + EXPECT_THAT(Split("1.0 2.5 1e-2", " "), + ElementsAre(half(1.0f), half(2.5f), half(0.01f))); +} + +TEST(SplitTest, SplitBfloat16) { + EXPECT_THAT(Split("1.0 2.5 1e-2", " "), + ElementsAre(Eigen::bfloat16(1.0f), Eigen::bfloat16(2.5f), + Eigen::bfloat16(0.01f))); +} + TEST(SplitTest, SplitInt) { EXPECT_THAT(Split("1,-1,258", ","), ElementsAre(1, -1, 258)); } diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc index 89fed23bb7d2a8..5b15e6a6ed0ee5 100644 --- a/tensorflow/lite/testing/tflite_driver.cc +++ b/tensorflow/lite/testing/tflite_driver.cc @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/lite/testing/result_expectations.h" #include "tensorflow/lite/tools/delegates/delegate_provider.h" #include "tensorflow/lite/tools/logging.h" +#include "tensorflow/lite/types/half.h" #if !defined(__APPLE__) #include "tensorflow/lite/delegates/flex/delegate.h" #endif @@ -405,11 +406,11 @@ void TfLiteDriver::SetInput(const std::string& name, break; } case kTfLiteFloat16: { - const auto& values = testing::Split(csv_values, ","); + const auto& values = testing::Split(csv_values, ","); for (auto k : values) { TFLITE_LOG(INFO) << "input" << k; } - if (!CheckSizes(tensor->bytes, values.size())) return; + if (!CheckSizes(tensor->bytes, values.size())) return; SetTensorData(values, tensor->data.raw); break; } @@ -500,7 +501,7 @@ void TfLiteDriver::SetExpectation(const std::string& name, expected_output_[id]->SetData>(csv_values); break; case kTfLiteFloat16: - expected_output_[id]->SetData(csv_values); + expected_output_[id]->SetData(csv_values); break; case kTfLiteBFloat16: expected_output_[id]->SetData(csv_values); diff --git a/tensorflow/lite/tools/benchmark/proto/CMakeLists.txt b/tensorflow/lite/tools/benchmark/proto/CMakeLists.txt index 12a7e577bd3277..6a39f06e03c32b 100644 --- a/tensorflow/lite/tools/benchmark/proto/CMakeLists.txt +++ b/tensorflow/lite/tools/benchmark/proto/CMakeLists.txt @@ -17,8 +17,8 @@ find_package(Protobuf REQUIRED) add_library(benchmark_result_proto benchmark_result.proto) list(APPEND benchmark_result_generated_files - ${CMAKE_BINARY_DIR}/tflite/tools/benchmark/proto/benchmark_result.pb.cc - ${CMAKE_BINARY_DIR}/tflite/tools/benchmark/proto/benchmark_result.pb.h) + ${CMAKE_BINARY_DIR}/tensorflow/lite/tools/benchmark/proto/benchmark_result.pb.cc + ${CMAKE_BINARY_DIR}/tensorflow/lite/tools/benchmark/proto/benchmark_result.pb.h) # Generate benchmark_result.pb.cc and benchmark_result.pb.h from # benchmark_result.proto using protoc. Once the protobuf package version is @@ -26,7 +26,7 @@ list(APPEND benchmark_result_generated_files add_custom_command( OUTPUT ${benchmark_result_generated_files} COMMAND ${Protobuf_PROTOC_EXECUTABLE} - ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${CMAKE_CURRENT_SOURCE_DIR}/../../../.. 
tflite/tools/benchmark/proto/benchmark_result.proto + ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${TENSORFLOW_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_result.proto DEPENDS ${Protobuf_PROTOC_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_result.proto ) diff --git a/tensorflow/lite/tools/cmake/modules/ml_dtypes/CMakeLists.txt b/tensorflow/lite/tools/cmake/modules/ml_dtypes/CMakeLists.txt index 91e893ee377048..8be897f54d728f 100644 --- a/tensorflow/lite/tools/cmake/modules/ml_dtypes/CMakeLists.txt +++ b/tensorflow/lite/tools/cmake/modules/ml_dtypes/CMakeLists.txt @@ -24,8 +24,9 @@ endif() add_library(ml_dtypes INTERFACE) target_include_directories(ml_dtypes INTERFACE - "${ML_DTYPES_SOURCE_DIR}" - "${ML_DTYPES_SOURCE_DIR}/ml_dtypes") + "$" + "$" + "$") file(GLOB ML_DTYPES_PUBLIC_HEADERS ${ML_DTYPES_SOURCE_DIR}/ml_dtypes/include/*.h) set_target_properties(ml_dtypes PROPERTIES diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake index c781a2fc18d86a..14e4370cbbd929 100644 --- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake +++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake @@ -23,7 +23,7 @@ OverridableFetchContent_Declare( xnnpack GIT_REPOSITORY https://github.com/google/XNNPACK # Sync with tensorflow/workspace2.bzl - GIT_TAG 1b918df9d1744ae40725254f4baa592ed05c912e + GIT_TAG 183297df5c945236cbc4bb1f625f9f2008bfc564 GIT_PROGRESS TRUE PREFIX "${CMAKE_BINARY_DIR}" SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack" @@ -49,5 +49,6 @@ include_directories( "${PTHREADPOOL_SOURCE_DIR}/include" "${FP16_SOURCE_DIR}/include" "${XNNPACK_SOURCE_DIR}/include" + "${XNNPACK_SOURCE_DIR}" "${CPUINFO_SOURCE_DIR}/" ) diff --git a/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc index f73bbfa1288754..d80391b3967130 100644 --- a/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc +++ b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc @@ -39,7 +39,7 @@ limitations under the License. #include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h" namespace { -tensorflow::string* g_test_model_dir = nullptr; +std::string* g_test_model_dir = nullptr; } // namespace namespace tflite { @@ -716,7 +716,7 @@ TEST(CalibratorTest, CalibrationWithCallOnce) { } // namespace tflite int main(int argc, char** argv) { - tensorflow::string model_file; + std::string model_file; const std::vector flag_list = { tensorflow::Flag("test_model_file", &model_file, "Path to test tflite model file."), @@ -727,8 +727,7 @@ int main(int argc, char** argv) { std::cerr << "Required test_model_file\n"; std::abort(); } - g_test_model_dir = - new tensorflow::string(tensorflow::io::Dirname(model_file)); + g_test_model_dir = new std::string(tensorflow::io::Dirname(model_file)); ::tensorflow::port::InitMain(argv[0], &argc, &argv); return RUN_ALL_TESTS(); } diff --git a/tensorflow/lite/tools/optimize/quantization_utils_test.cc b/tensorflow/lite/tools/optimize/quantization_utils_test.cc index 33f62f0c850363..e1494788dc45b4 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils_test.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils_test.cc @@ -36,7 +36,7 @@ limitations under the License. 
#include "tensorflow/lite/testing/util.h" namespace { -tensorflow::string* g_test_model_dir = nullptr; +std::string* g_test_model_dir = nullptr; } // namespace namespace tflite { @@ -901,7 +901,7 @@ TEST_F(QuantizationUtilsTest, ExtendToPowerOfTwo) { } // namespace tflite int main(int argc, char** argv) { - tensorflow::string model_file; + std::string model_file; const std::vector flag_list = { tensorflow::Flag("test_model_file", &model_file, "Path to test tflite model file."), @@ -912,8 +912,7 @@ int main(int argc, char** argv) { std::cerr << "Required test_model_file\n"; std::abort(); } - g_test_model_dir = - new tensorflow::string(tensorflow::io::Dirname(model_file)); + g_test_model_dir = new std::string(tensorflow::io::Dirname(model_file)); ::tensorflow::port::InitMain(argv[0], &argc, &argv); return RUN_ALL_TESTS(); } diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index 8a0013b09e6851..319da9523aea7e 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -39,7 +39,7 @@ limitations under the License. // Note: More rigorous model tests can be found in subgraph_quantizer_test.cc namespace { -tensorflow::string* g_test_model_dir = nullptr; +std::string* g_test_model_dir = nullptr; } // namespace namespace tflite { @@ -2309,7 +2309,7 @@ TEST_P(BiasInputTest, QuantizationSucceeds) { } // namespace tflite int main(int argc, char** argv) { - tensorflow::string model_file; + std::string model_file; const std::vector flag_list = { tensorflow::Flag("test_model_file", &model_file, "Path to test tflite model file."), @@ -2320,8 +2320,7 @@ int main(int argc, char** argv) { std::cerr << "Required test_model_file\n"; std::abort(); } - g_test_model_dir = - new tensorflow::string(tensorflow::io::Dirname(model_file)); + g_test_model_dir = new std::string(tensorflow::io::Dirname(model_file)); ::tensorflow::port::InitMain(argv[0], &argc, &argv); return RUN_ALL_TESTS(); } diff --git a/tensorflow/lite/tools/utils.cc b/tensorflow/lite/tools/utils.cc index 6173ec1b112203..96b8bf8689e610 100644 --- a/tensorflow/lite/tools/utils.cc +++ b/tensorflow/lite/tools/utils.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include "absl/types/span.h" #include "Eigen/Core" // from @eigen_archive diff --git a/tensorflow/lite/types/BUILD b/tensorflow/lite/types/BUILD index c00aadb6ae46e9..0bc596f7782e2a 100644 --- a/tensorflow/lite/types/BUILD +++ b/tensorflow/lite/types/BUILD @@ -28,4 +28,8 @@ cc_library( "fp16.h", "half.h", ], + # copybara:uncomment_begin(google-only) + # compatible_with = ["//buildenv/target:non_prod"], + # copybara:uncomment_end + deps = ["@FP16"], ) diff --git a/tensorflow/lite/types/fp16.h b/tensorflow/lite/types/fp16.h index cc63fe7d21fbd8..94484350f68bcd 100644 --- a/tensorflow/lite/types/fp16.h +++ b/tensorflow/lite/types/fp16.h @@ -27,6 +27,13 @@ limitations under the License. // - https://github.com/google/XNNPACK/issues/6989 // We also don't need a lot of the functionality in the upstream library. +// If building with a library that uses //third_party/FP16, that library +// provides its own fp16 conversion functions. Avoid redefining them here to +// prevent build errors. +// FP16_H and FP16_BITCASTS_H are defined by //third_party/FP16/fp16.h and +// //third_party/FP16/bitcasts.h respectively. 
+#if !defined(FP16_H) && !defined(FP16_BITCASTS_H) + static inline float fp32_from_bits(uint32_t w) { union { uint32_t as_bits; @@ -216,4 +223,6 @@ static inline uint16_t fp16_ieee_from_fp32_value(float f) { (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); } +#endif // !defined(FP16_H) && !defined(FP16_BITCASTS_H) + #endif // TENSORFLOW_LITE_TYPES_FP16_H_ diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 77eb63a7551ed6..79c85e589caa13 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -29,7 +29,7 @@ # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 7) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 24) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None diff --git a/tensorflow/python/compiler/tensorrt/test/BUILD b/tensorflow/python/compiler/tensorrt/test/BUILD index 388140b04fac1d..26582e8aac4f51 100644 --- a/tensorflow/python/compiler/tensorrt/test/BUILD +++ b/tensorflow/python/compiler/tensorrt/test/BUILD @@ -74,7 +74,6 @@ filegroup( base_tags = [ "no_cuda_on_cpu_tap", - "cuda-only", "no_windows", "nomac", # TODO(b/303453873): Re-enable tests once TensorRT has been updated diff --git a/tensorflow/python/debug/lib/BUILD b/tensorflow/python/debug/lib/BUILD index f30a5a8c6668ec..0b3860dbaa9934 100644 --- a/tensorflow/python/debug/lib/BUILD +++ b/tensorflow/python/debug/lib/BUILD @@ -331,7 +331,6 @@ cuda_py_strict_test( shard_count = 4, tags = [ "no_windows", # TODO(b/142475891): Enable this test on Windows. - "cuda-only", #TODO(ROCm) Re-enable after issue is fixed. ], xla_enable_strict_auto_jit = False, # Node names are different with autojit deps = [ @@ -363,7 +362,6 @@ cuda_py_strict_test( python_version = "PY3", tags = [ "no_windows_gpu", - "cuda-only", #TODO(ROCm) Re-enable after issue is fixed. ], deps = [ ":debug_events_reader", diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 7b5f6a94506487..cf156d75a4380d 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -712,7 +712,6 @@ distribute_py_strict_test( "multi_and_single_gpu", "no_cuda_asan", # b/213388775 "no_oss", # b/241013307 - "cuda-only", "notap", # Flaky; TODO(b/289970206) ], tpu_tags = [ @@ -2502,7 +2501,6 @@ distribute_py_strict_test( "multi_and_single_gpu", "nomac", # TODO(b/201788023): Attempt MultiProcessCluster to fix this. 
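The compat.py hunk above only bumps the forward-compatibility horizon date, but the surrounding comment describes how that date is consumed. A minimal, hedged Python sketch of that consumption pattern follows; compat.forward_compatible is the existing TensorFlow helper, while dispatch_example is an illustrative name, not code from this patch.

from tensorflow.python.compat import compat

def dispatch_example():
  # Illustrative only: guard a new graph-construction path behind the horizon.
  # forward_compatible() returns True once the ambient date (plus any
  # TF_FORWARD_COMPATIBILITY_DELTA_DAYS offset) passes the given date.
  if compat.forward_compatible(2025, 12, 24):
    return "new behavior"
  return "compatible fallback"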
"notpu", - "cuda-only", #times out ], deps = [ ":distribute_lib", diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD index 7e905c3d51b0c1..1c28e3b8cc706c 100644 --- a/tensorflow/python/feature_column/BUILD +++ b/tensorflow/python/feature_column/BUILD @@ -164,7 +164,6 @@ tf_py_strict_test( "no_cuda_on_cpu_tap", "no_oss", # TODO(b/206860622): Broken with numpy 1.20+ "no_pip", - "cuda-only", "no_windows", ], deps = [ @@ -209,7 +208,6 @@ tf_py_strict_test( "no_cuda_on_cpu_tap", "no_oss", # TODO(b/206860622): Broken with numpy 1.20+ "no_pip", - "cuda-only", "no_windows", ], deps = [":feature_column_v2_test_main_lib"], diff --git a/tensorflow/python/framework/BUILD b/tensorflow/python/framework/BUILD index ba216815b3a623..e3ebab0c442106 100644 --- a/tensorflow/python/framework/BUILD +++ b/tensorflow/python/framework/BUILD @@ -1502,7 +1502,6 @@ cuda_py_strict_test( srcs = ["config_test.py"], tags = [ "no_pip", # test_ops are not available in pip - "cuda-only", ], deps = [ ":config", diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index b44ef77a7e901d..0aaa5add6081a9 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -1281,6 +1281,12 @@ def is_tf_type(x): # pylint: disable=invalid-name Returns: `True` if `x` is a TensorFlow-native type. """ + # ObjectProxy is a special type of object that is used by wrapt to wrap + # objects. It is not a Tensor. + if (type(x).__name__ == "ObjectProxy"): + return False + if (type(x).__name__ == "_DictWrapper"): + return False return isinstance(x, tf_type_classes) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 9369ffa456392a..4206ef9f882ffc 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -83,6 +83,9 @@ # pylint: disable=g-import-not-at-top try: + # Disable loading HDF5 plugins from a default path and prevent ZDI-CAN-25480. + # Importing h5py prior to importing tensorflow will restore the old behavior. + os.environ['HDF5_PLUGIN_PATH'] = 'disable' import h5py except ImportError: h5py = None diff --git a/tensorflow/python/keras/keras_parameterized.py b/tensorflow/python/keras/keras_parameterized.py index 054df939e8e59a..1a44e6b76f3276 100644 --- a/tensorflow/python/keras/keras_parameterized.py +++ b/tensorflow/python/keras/keras_parameterized.py @@ -17,6 +17,7 @@ import collections import functools import itertools +import os import unittest from absl.testing import parameterized @@ -30,6 +31,9 @@ from tensorflow.python.util import nest try: + # Disable loading HDF5 plugins from a default path and prevent ZDI-CAN-25480. + # Importing h5py prior to importing tensorflow will restore the old behavior. + os.environ['HDF5_PLUGIN_PATH'] = 'disable' import h5py # pylint:disable=g-import-not-at-top except ImportError: h5py = None diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py index 1f6bbc43320d0a..05a2c9282909a2 100644 --- a/tensorflow/python/keras/saving/hdf5_format.py +++ b/tensorflow/python/keras/saving/hdf5_format.py @@ -34,6 +34,9 @@ # pylint: disable=g-import-not-at-top try: + # Disable loading HDF5 plugins from a default path and prevent ZDI-CAN-25480. + # Importing h5py prior to importing tensorflow will restore the old behavior. 
+ os.environ['HDF5_PLUGIN_PATH'] = 'disable' import h5py HDF5_OBJECT_HEADER_LIMIT = 64512 except ImportError: diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index eee859233e5eba..b9ba0bc20d0ba0 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """Keras model saving code.""" - +import os from tensorflow.python import tf2 from tensorflow.python.keras.saving import hdf5_format from tensorflow.python.keras.saving import saving_utils @@ -25,6 +25,9 @@ # pylint: disable=g-import-not-at-top try: + # Disable loading HDF5 plugins from a default path and prevent ZDI-CAN-25480. + # Importing h5py prior to importing tensorflow will restore the old behavior. + os.environ['HDF5_PLUGIN_PATH'] = 'disable' import h5py except ImportError: h5py = None diff --git a/tensorflow/python/kernel_tests/image_ops/BUILD b/tensorflow/python/kernel_tests/image_ops/BUILD index b7c79c74dae5f9..640edf8f97a629 100644 --- a/tensorflow/python/kernel_tests/image_ops/BUILD +++ b/tensorflow/python/kernel_tests/image_ops/BUILD @@ -141,7 +141,6 @@ cuda_py_strict_test( shard_count = 15, tags = [ "no_oss", # b/241024908 - "cuda-only", "nomac", # b/181799478 "notap", # b/31080670 ], diff --git a/tensorflow/python/kernel_tests/nn_ops/BUILD b/tensorflow/python/kernel_tests/nn_ops/BUILD index 507d00c15d196c..df5b780a6e7367 100644 --- a/tensorflow/python/kernel_tests/nn_ops/BUILD +++ b/tensorflow/python/kernel_tests/nn_ops/BUILD @@ -296,7 +296,6 @@ cuda_py_strict_test( shard_count = 4, tags = [ "no_mac_arm64", - "cuda-only", "optonly", # times out ], deps = [ @@ -438,7 +437,6 @@ cuda_py_strict_test( size = "medium", # http://b/30603882 timeout = "long", srcs = ["depthwise_conv_op_d9m_test.py"], - tags = ["cuda-only"], shard_count = 8, deps = [ ":depthwise_conv_op_base", diff --git a/tensorflow/python/kernel_tests/sparse_ops/BUILD b/tensorflow/python/kernel_tests/sparse_ops/BUILD index 37b8518b3c1ebe..20fe7ab1adfb98 100644 --- a/tensorflow/python/kernel_tests/sparse_ops/BUILD +++ b/tensorflow/python/kernel_tests/sparse_ops/BUILD @@ -108,7 +108,6 @@ cuda_py_strict_test( shard_count = 5, tags = [ "optonly", # b/77589990 - "cuda-only" ], deps = [ "//tensorflow/python/eager:def_function", diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc index c33014cc3ae5b2..1d897c4b67e512 100644 --- a/tensorflow/python/lib/core/ndarray_tensor.cc +++ b/tensorflow/python/lib/core/ndarray_tensor.cc @@ -100,7 +100,7 @@ absl::Status PyArrayDescr_to_TF_DataType(PyArray_Descr* descr, if (!key_string) { return errors::Internal("Corrupt numpy type descriptor"); } - tensorflow::string key = key_string; + std::string key = key_string; // The typenames here should match the field names in the custom struct // types constructed in test_util.py. 
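For readers skimming the Keras hunks above (training.py, keras_parameterized.py, hdf5_format.py, save.py): they all apply the same guarded-import pattern. A self-contained sketch of that pattern, assuming only os and an optional h5py install:

import os

try:
  # Disable loading HDF5 plugins from a default path (mitigation for
  # ZDI-CAN-25480). Importing h5py before this point restores the old behavior.
  os.environ['HDF5_PLUGIN_PATH'] = 'disable'
  import h5py
except ImportError:
  h5py = None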
// TODO(mrry,keveman): Investigate Numpy type registration to replace this @@ -320,7 +320,8 @@ absl::Status EncodePyBytesArray(PyArrayObject* array, int64_t nelems, return absl::OkStatus(); } -absl::Status CopyTF_TensorStringsToPyArray(const TF_Tensor* src, uint64 nelems, +absl::Status CopyTF_TensorStringsToPyArray(const TF_Tensor* src, + uint64_t nelems, PyArrayObject* dst) { const void* tensor_data = TF_TensorData(src); DCHECK(tensor_data != nullptr); diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc index 703bab0f65a7b8..fbb1f10c855b15 100644 --- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc +++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc @@ -253,10 +253,10 @@ absl::Status ArrayFromMemory(int dim_size, npy_intp* dims, void* data, auto* np_array = reinterpret_cast( PyArray_SimpleNewFromData(dim_size, dims, type_num, data)); if (np_array == nullptr) { - string shape_str = absl::StrJoin( + std::string shape_str = absl::StrJoin( absl::Span{dims, static_cast(dim_size)}, ", "); if (PyErr_Occurred()) { - string exception_str = PyExceptionFetch(); + std::string exception_str = PyExceptionFetch(); PyErr_Clear(); return errors::InvalidArgument( "Failed to create numpy array from tensor of shape [", shape_str, diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc index 16ba9db74ba764..54178667bfddaa 100644 --- a/tensorflow/python/lib/core/py_func.cc +++ b/tensorflow/python/lib/core/py_func.cc @@ -63,7 +63,7 @@ PyObject* GetPyTrampoline() { struct PyCall { // Passed to python runtime to call the python function registered // with this "token". - string token; + std::string token; // The device on which Tensors are stored; only used for EagerPyFunc. Device* device = nullptr; @@ -164,7 +164,8 @@ absl::Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor, TF_RETURN_IF_ERROR(handle->Tensor(output_tensor)); // actual_device may be nullptr, which implies local CPU. if (expected_device == actual_device) return absl::OkStatus(); - const string& expected_device_name = expected_device->attributes().name(); + const std::string& expected_device_name = + expected_device->attributes().name(); if (actual_device == nullptr) { if (!IsCPUDevice(expected_device)) { return errors::Internal( @@ -380,7 +381,8 @@ class PyFuncOp : public OpKernel { return; } - OP_REQUIRES(ctx, static_cast(call.out.size()) == ctx->num_outputs(), + OP_REQUIRES(ctx, + static_cast(call.out.size()) == ctx->num_outputs(), errors::InvalidArgument(token_, " returns ", call.out.size(), " values, but expects to see ", ctx->num_outputs(), " values.")); @@ -396,7 +398,7 @@ class PyFuncOp : public OpKernel { } private: - string token_; + std::string token_; // True if and only if this op should execute the python function eagerly, // i.e., if and only if the eager attribute is set. 
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc index 3547cd4a8ddc81..6b2b6be8cf53e7 100644 --- a/tensorflow/python/lib/core/py_seq_tensor.cc +++ b/tensorflow/python/lib/core/py_seq_tensor.cc @@ -360,8 +360,9 @@ struct ConverterTraits { typedef Converter Int64Converter; template <> -struct ConverterTraits { - static AbstractTensorInterface* CreateScalar(TFE_Context* ctx, uint64 value) { +struct ConverterTraits { + static AbstractTensorInterface* CreateScalar(TFE_Context* ctx, + uint64_t value) { return tensorflow::unwrap(ctx)->CreateUint64Scalar(value); } @@ -370,7 +371,7 @@ struct ConverterTraits { return tensorflow::unwrap(ctx)->CreateTensor(DT_UINT64, dim_sizes); } - static const char* ConvertScalar(PyObject* v, uint64* out) { + static const char* ConvertScalar(PyObject* v, uint64_t* out) { #if PY_MAJOR_VERSION < 3 if (TF_PREDICT_TRUE(PyInt_Check(v))) { *out = PyInt_AsUnsignedLongLongMask(v); @@ -394,10 +395,10 @@ struct ConverterTraits { } }; -typedef Converter UInt64Converter; +typedef Converter UInt64Converter; template <> -struct ConverterTraits { +struct ConverterTraits { static AbstractTensorInterface* CreateScalar(TFE_Context* ctx, int32_t value) { return tensorflow::unwrap(ctx)->CreateInt32Scalar(value); @@ -408,7 +409,7 @@ struct ConverterTraits { return tensorflow::unwrap(ctx)->CreateTensor(DT_INT32, dim_sizes); } - static const char* ConvertScalar(PyObject* v, int32* out) { + static const char* ConvertScalar(PyObject* v, int32_t* out) { int64_t i; #if PY_MAJOR_VERSION < 3 if (TF_PREDICT_TRUE(PyInt_Check(v))) { @@ -432,14 +433,14 @@ struct ConverterTraits { } else { return ErrorMixedTypes; } - *out = static_cast(static_cast(i)); + *out = static_cast(static_cast(i)); // Check for 32-bit overflow. if (TF_PREDICT_FALSE(i != *out)) return ErrorFoundInt64; return nullptr; } }; -typedef Converter Int32Converter; +typedef Converter Int32Converter; // Floating-point support @@ -694,11 +695,11 @@ TFE_TensorHandle* NumpyToTFE_TensorHandle(TFE_Context* ctx, PyObject* obj) { absl::Status status = tensorflow::NdarrayToTensor(ctx, obj, &tf_tensor); if (TF_PREDICT_FALSE(!status.ok())) { - PyErr_SetString(PyExc_ValueError, - tensorflow::strings::StrCat( - "Failed to convert a NumPy array to a Tensor (", - status.message(), ").") - .c_str()); + PyErr_SetString( + PyExc_ValueError, + absl::StrCat("Failed to convert a NumPy array to a Tensor (", + status.message(), ").") + .c_str()); return nullptr; } @@ -758,8 +759,7 @@ TFE_TensorHandle* PySeqToTFE_TensorHandle(TFE_Context* ctx, PyObject* obj, .ok()) { PyErr_SetString( PyExc_TypeError, - tensorflow::strings::StrCat("Invalid dtype argument value ", dtype) - .c_str()); + absl::StrCat("Invalid dtype argument value ", dtype).c_str()); return nullptr; } } diff --git a/tensorflow/python/lib/core/py_util.cc b/tensorflow/python/lib/core/py_util.cc index a78f0a12f21c3f..fa1845bd782841 100644 --- a/tensorflow/python/lib/core/py_util.cc +++ b/tensorflow/python/lib/core/py_util.cc @@ -45,7 +45,7 @@ const char* ClassName(PyObject* py) { // Returns a PyObject containing a string, or null void TryAppendTraceback(PyObject* ptype, PyObject* pvalue, PyObject* ptraceback, - string* out) { + std::string* out) { // The "traceback" module is assumed to be imported already by script_ops.py. 
PyObject* tb_module = PyImport_AddModule("traceback"); @@ -84,7 +84,7 @@ void TryAppendTraceback(PyObject* ptype, PyObject* pvalue, PyObject* ptraceback, #if PY_MAJOR_VERSION < 3 strings::StrAppend(out, PyString_AS_STRING(v), "\n"); #else - strings::StrAppend(out, PyUnicode_AsUTF8(v), "\n"); + absl::StrAppend(out, PyUnicode_AsUTF8(v), "\n"); #endif } @@ -92,7 +92,7 @@ void TryAppendTraceback(PyObject* ptype, PyObject* pvalue, PyObject* ptraceback, Py_DECREF(ret_val); } -string PyExceptionFetch() { +std::string PyExceptionFetch() { CHECK(PyErr_Occurred()) << "Must only call PyExceptionFetch after an exception."; PyObject* ptype; @@ -100,7 +100,7 @@ string PyExceptionFetch() { PyObject* ptraceback; PyErr_Fetch(&ptype, &pvalue, &ptraceback); PyErr_NormalizeException(&ptype, &pvalue, &ptraceback); - string err = ClassName(ptype); + std::string err = ClassName(ptype); if (pvalue) { PyObject* str = PyObject_Str(pvalue); @@ -108,11 +108,11 @@ string PyExceptionFetch() { #if PY_MAJOR_VERSION < 3 strings::StrAppend(&err, ": ", PyString_AS_STRING(str), "\n"); #else - strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str), "\n"); + absl::StrAppend(&err, ": ", PyUnicode_AsUTF8(str), "\n"); #endif Py_DECREF(str); } else { - strings::StrAppend(&err, "(unknown error message)\n"); + absl::StrAppend(&err, "(unknown error message)\n"); } TryAppendTraceback(ptype, pvalue, ptraceback, &err); diff --git a/tensorflow/python/lib/core/py_util.h b/tensorflow/python/lib/core/py_util.h index af1b21699e6502..d6b2b9f78ddca6 100644 --- a/tensorflow/python/lib/core/py_util.h +++ b/tensorflow/python/lib/core/py_util.h @@ -25,7 +25,7 @@ namespace tensorflow { // Fetch the exception message as a string. An exception must be set // (PyErr_Occurred() must be true). -string PyExceptionFetch(); +std::string PyExceptionFetch(); // Assert that Python GIL is held. inline void DCheckPyGilState() { diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD index 069280d4425fb7..c624c412f3d12d 100644 --- a/tensorflow/python/ops/BUILD +++ b/tensorflow/python/ops/BUILD @@ -1006,9 +1006,6 @@ tf_py_strict_test( name = "collective_ops_test", size = "small", srcs = ["collective_ops_test.py"], - tags = [ - "cuda-only", - ], deps = [ ":array_ops", ":collective_ops", @@ -1037,7 +1034,6 @@ tf_py_strict_test( srcs = ["collective_ops_xla_test.py"], tags = [ "no_pip", - "cuda-only", "no_windows", "nomac", ], @@ -3594,9 +3590,6 @@ cuda_py_strict_test( srcs = ["nn_fused_batchnorm_d9m_test.py"], main = "nn_fused_batchnorm_d9m_test.py", shard_count = 4, - tags = [ - "cuda-only", - ], deps = [ ":nn_grad", ":nn_impl", @@ -3618,9 +3611,6 @@ cuda_py_strict_test( srcs = ["nn_fused_batchnorm_test.py"], main = "nn_fused_batchnorm_test.py", shard_count = 24, - tags = [ - "cuda-only", - ], deps = [ ":array_ops", ":gradient_checker", @@ -3765,7 +3755,6 @@ cuda_py_strict_test( main = "special_math_ops_test.py", shard_count = 10, tags = [ - "cuda-only", "no_windows_gpu", ], deps = [ diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 9f6644b4342ada..94dadf91a0e18d 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -982,8 +982,8 @@ def slice(input_, begin, size, name=None): Args: input_: A `Tensor`. - begin: An `int32` or `int64` `Tensor`. - size: An `int32` or `int64` `Tensor`. + begin: An `int16`, `int32` or `int64` `Tensor`. + size: An `int16`, `int32` or `int64` `Tensor`. name: A name for the operation (optional). 
Returns: diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 087af6a842fc86..c8e8e33ca9c69b 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -1718,6 +1718,12 @@ def resize_images_v2(images, >>> max_10_20.shape.as_list() [1, 10, 10, 1] + Note: + The `bicubic` interpolation method currently does not have a GPU kernel + implementation. As a result, `tf.image.resize(..., method='bicubic')` + always executes on the CPU, even when GPU devices are available. + + Args: images: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor of shape `[height, width, channels]`. diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD index c1272a552ed4e0..67bc83f6dd5445 100644 --- a/tensorflow/python/ops/parallel_for/BUILD +++ b/tensorflow/python/ops/parallel_for/BUILD @@ -127,7 +127,6 @@ cuda_py_strict_test( shard_count = 16, tags = [ "no_oss", - "cuda-only", ], deps = [ ":control_flow_ops", diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py index 55505df533d447..c279965f378982 100644 --- a/tensorflow/python/ops/ragged/ragged_factory_ops.py +++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py @@ -30,9 +30,9 @@ from tensorflow.python.util.tf_export import tf_export -#=============================================================================== +# =============================================================================== # Op to construct a constant RaggedTensor from a nested Python list. -#=============================================================================== +# =============================================================================== @tf_export("ragged.constant") @dispatch.add_dispatch_support def constant( @@ -57,15 +57,16 @@ def constant( Args: pylist: A nested `list`, `tuple` or `np.ndarray`. Any nested element that - is not a `list`, `tuple` or `np.ndarray` must be a scalar value - compatible with `dtype`. + is not a `list`, `tuple` or `np.ndarray` must be a scalar value compatible + with `dtype`. dtype: The type of elements for the returned `RaggedTensor`. If not specified, then a default is chosen based on the scalar values in - `pylist`. + `pylist`. If there are no scalar values in `pylist`, then the default is + `tf.float32`. ragged_rank: An integer specifying the ragged rank of the returned `RaggedTensor`. Must be nonnegative and less than `K`. Defaults to - `max(0, K - 1)` if `inner_shape` is not specified. Defaults to - `max(0, K - 1 - len(inner_shape))` if `inner_shape` is specified. + `max(0, K - 1)` if `inner_shape` is not specified. Defaults to `max(0, K + - 1 - len(inner_shape))` if `inner_shape` is specified. inner_shape: A tuple of integers specifying the shape for individual inner values in the returned `RaggedTensor`. Defaults to `()` if `ragged_rank` is not specified. 
If `ragged_rank` is specified, then a default is chosen diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD index 5b38d0f3f8eae8..ae4fae57926bfe 100644 --- a/tensorflow/python/profiler/internal/BUILD +++ b/tensorflow/python/profiler/internal/BUILD @@ -108,6 +108,7 @@ tf_python_pybind_extension( "//tensorflow/tools/pip_package:__subpackages__", ], deps = [ + "@local_xla//xla/python/profiler/internal:traceme_state", "@local_xla//xla/python/profiler/internal:traceme_wrapper", "@pybind11", ], diff --git a/tensorflow/python/profiler/internal/_pywrap_traceme.pyi b/tensorflow/python/profiler/internal/_pywrap_traceme.pyi index 105e2dce09d3a7..47b8b56c94a269 100644 --- a/tensorflow/python/profiler/internal/_pywrap_traceme.pyi +++ b/tensorflow/python/profiler/internal/_pywrap_traceme.pyi @@ -17,3 +17,5 @@ class TraceMe: def __init__(self, arg0: str, **kwargs) -> None: ... def SetMetadata(self, **kwargs) -> None: ... def Stop(self) -> None: ... + +def traceme_enabled(*args, **kwargs): ... diff --git a/tensorflow/python/profiler/internal/traceme_wrapper.cc b/tensorflow/python/profiler/internal/traceme_wrapper.cc index ba1b1a63674491..9397eb18134cf3 100644 --- a/tensorflow/python/profiler/internal/traceme_wrapper.cc +++ b/tensorflow/python/profiler/internal/traceme_wrapper.cc @@ -17,14 +17,33 @@ limitations under the License. #include "pybind11/attr.h" // from @pybind11 #include "pybind11/pybind11.h" // from @pybind11 +#include "xla/python/profiler/internal/traceme_state.h" namespace py = ::pybind11; using ::xla::profiler::TraceMeWrapper; +// Returns true if TraceMe is enabled. +// This is a low-overhead function that can be called frequently. +static PyObject* traceme_enabled(PyObject* self, PyObject* args) { + if (xla::profiler::traceme_enabled) { + Py_RETURN_TRUE; + } + Py_RETURN_FALSE; +} + +static PyMethodDef traceme_method_def = {"traceme_enabled", traceme_enabled, + METH_NOARGS, + "Returns true if TraceMe is enabled."}; + PYBIND11_MODULE(_pywrap_traceme, m) { py::class_(m, "TraceMe", py::module_local()) .def(py::init()) .def("SetMetadata", &TraceMeWrapper::SetMetadata) .def("Stop", &TraceMeWrapper::Stop); + + py::object module_name = m.attr("__name__"); + m.attr("traceme_enabled") = + py::reinterpret_steal(PyCFunction_NewEx( + &traceme_method_def, /*self=*/nullptr, module_name.ptr())); }; diff --git a/tensorflow/python/profiler/profiler_v2_test.py b/tensorflow/python/profiler/profiler_v2_test.py index b0b4ff301f6b0b..bec85cdc60bba8 100644 --- a/tensorflow/python/profiler/profiler_v2_test.py +++ b/tensorflow/python/profiler/profiler_v2_test.py @@ -98,6 +98,14 @@ def test_context_manager_with_options(self): file_list = gfile.ListDirectory(logdir) self.assertEqual(len(file_list), 1) + def test_callback(self): + logdir = self.get_temp_dir() + self.assertFalse(trace.enabled()) + profiler.start(logdir) + self.assertTrue(trace.enabled()) + profiler.stop() + self.assertFalse(trace.enabled()) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/profiler/trace.py b/tensorflow/python/profiler/trace.py index 6b6bc7ac243a75..4c877803ef7623 100644 --- a/tensorflow/python/profiler/trace.py +++ b/tensorflow/python/profiler/trace.py @@ -19,9 +19,9 @@ from tensorflow.python.profiler.internal import _pywrap_traceme from tensorflow.python.util.tf_export import tf_export -# This variable is modified by PythonHooks::Start/Stop() in C++. Such -# arrangement will reduce the number of calls through pybind11. 
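Referring back to the resize_images_v2 docstring note added above (bicubic currently has no GPU kernel): a small hedged illustration using the public tf.image.resize API; the device-placement claim comes from that note, not from this sketch.

import tensorflow as tf

image = tf.zeros([1, 8, 8, 3])
# Per the added note, the bicubic call runs on CPU even when a GPU is present;
# other methods such as bilinear can use a GPU kernel.
bicubic = tf.image.resize(image, [16, 16], method='bicubic')
bilinear = tf.image.resize(image, [16, 16], method='bilinear')
print(bicubic.shape, bilinear.shape)  # both (1, 16, 16, 3)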
-enabled = False +# This is a low-overhead function that directly calls C++ to check if the +# profiler is enabled. +enabled = _pywrap_traceme.traceme_enabled @tf_export('profiler.experimental.Trace', v1=[]) @@ -74,7 +74,7 @@ def __init__(self, name, **kwargs): The example above uses the keyword argument "step_num" to specify the training step being traced. """ - if enabled: + if enabled(): # Creating _pywrap_traceme.TraceMe starts the clock. self._traceme = _pywrap_traceme.TraceMe(name, **kwargs) else: @@ -177,7 +177,7 @@ def inner_wrapper(func): @functools.wraps(func) def wrapped(*args, **kwargs): - if enabled: + if enabled(): with Trace(trace_name, **trace_kwargs): return func(*args, **kwargs) return func(*args, **kwargs) diff --git a/tensorflow/python/trackable/data_structures.py b/tensorflow/python/trackable/data_structures.py index c920dd882aac35..3bcb4c9ccba3b9 100644 --- a/tensorflow/python/trackable/data_structures.py +++ b/tensorflow/python/trackable/data_structures.py @@ -23,9 +23,6 @@ # Fall back to the build-time dependency if the system package is not available. from .....third_party import wrapt # pylint: disable=relative-beyond-top-level -from tensorflow.python.eager import def_function -from tensorflow.python.eager import function as defun -from tensorflow.python.ops import variables from tensorflow.python.trackable import base from tensorflow.python.trackable import layer_utils from tensorflow.python.util.compat import collections_abc @@ -195,6 +192,8 @@ def trainable(self, value): def _track_value(self, value, name): """Add a dependency on `value`.""" + # pylint: disable=g-import-not-at-top + from tensorflow.python.ops import variables value = sticky_attribute_assignment( trackable=self, value=value, name=name) if isinstance(value, variables.Variable): @@ -810,6 +809,12 @@ def __reduce_ex__(self, protocol): (self.__wrapped__,)) def __getattribute__(self, name): + if name == "__dict__": + # Returns __dict__ from wrapt.ObjectProxy + try: + return object.__getattribute__(self, "__dict__") + except (AttributeError, TypeError): + return {} if (hasattr(type(self), name) and isinstance(getattr(type(self), name), property)): # Bypass ObjectProxy for properties. Whether this workaround is necessary @@ -1108,6 +1113,9 @@ def __getattribute__(self, name): def _is_function(x): + # pylint: disable=g-import-not-at-top + from tensorflow.python.eager import def_function + from tensorflow.python.eager import function as defun return isinstance(x, (def_function.Function, defun.ConcreteFunction)) diff --git a/tensorflow/python/training/BUILD b/tensorflow/python/training/BUILD index 4bc1e84a600dee..3d6038075b86cf 100644 --- a/tensorflow/python/training/BUILD +++ b/tensorflow/python/training/BUILD @@ -1185,7 +1185,6 @@ cuda_py_strict_test( name = "basic_loops_test", size = "medium", srcs = ["basic_loops_test.py"], - tags = ["cuda-only"], #TODO(ROCm) Re-enable after issue is fixed. 
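Usage note for the trace.py change above: enabled is now a C++-backed callable bound to _pywrap_traceme.traceme_enabled, so call sites test enabled() rather than reading a module-level bool. A hedged sketch of the updated pattern; run_step is a placeholder, not TensorFlow API.

from tensorflow.python.profiler import trace

def run_step(step_num):
  return step_num  # stand-in for real per-step work

def maybe_traced_step(step_num):
  # Cheap check; true only while the profiler is collecting.
  if trace.enabled():
    with trace.Trace('train_step', step_num=step_num):
      return run_step(step_num)
  return run_step(step_num)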
deps = [ ":basic_loops", ":supervisor", diff --git a/tensorflow/python/util/BUILD b/tensorflow/python/util/BUILD index 5875bf0e16668d..0851f88e67434a 100644 --- a/tensorflow/python/util/BUILD +++ b/tensorflow/python/util/BUILD @@ -133,6 +133,8 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core/platform:stringpiece", "//tensorflow/python/lib/core:safe_pyobject_ptr", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", "@local_xla//third_party/python_runtime:headers", ], alwayslink = 1, diff --git a/tensorflow/python/util/kernel_registry.cc b/tensorflow/python/util/kernel_registry.cc index 6a78c6668d9643..8d6a68dd7397f6 100644 --- a/tensorflow/python/util/kernel_registry.cc +++ b/tensorflow/python/util/kernel_registry.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/python/util/kernel_registry.h" +#include + #include "absl/log/log.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_util.h" diff --git a/tensorflow/python/util/nest.cc b/tensorflow/python/util/nest.cc index d7df8c42dde196..467359cbb9cf5e 100644 --- a/tensorflow/python/util/nest.cc +++ b/tensorflow/python/util/nest.cc @@ -19,6 +19,8 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/python/lib/core/safe_pyobject_ptr.h" diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index b80627e3bbf891..d7c5e0f9dbe5a2 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -89,7 +89,6 @@ load( "@local_xla//third_party/py/rules_pywrap:pywrap.default.bzl", "use_pywrap_rules", _pybind_extension = "pybind_extension", - _stripped_cc_info = "stripped_cc_info", ) # Do not sort: copybara rule changes this @@ -3340,8 +3339,6 @@ def pybind_extension( **kwargs ) -stripped_cc_info = _stripped_cc_info - # Note: we cannot add //third_party/tf_runtime:__subpackages__ here, # because that builds all of tf_runtime's packages, and some of them # are known not to build on big endian systems. diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 deleted file mode 100644 index 91d501109d08a1..00000000000000 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 +++ /dev/null @@ -1,87 +0,0 @@ -# Dockerfile to build a manylinux 2010 compliant cross-compiler. -# -# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible -# glibc (2.12) and system libstdc++ (4.4). -# -# To push a new version, run: -# $ docker build -f Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 \ -# --tag "gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu16.04-manylinux2010" . 
-# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu16.04-manylinux2010 - -FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04 as devtoolset - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y \ - cpio \ - file \ - flex \ - g++ \ - make \ - rpm2cpio \ - unar \ - wget \ - && \ - rm -rf /var/lib/apt/lists/* - -ADD devtoolset/fixlinks.sh fixlinks.sh -ADD devtoolset/build_devtoolset.sh build_devtoolset.sh -ADD devtoolset/rpm-patch.sh rpm-patch.sh - -# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7. -RUN /build_devtoolset.sh devtoolset-7 /dt7 -# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-8 in /dt8. -RUN /build_devtoolset.sh devtoolset-8 /dt8 - -# TODO(klimek): Split up into two different docker images. -FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04 -COPY --from=devtoolset /dt7 /dt7 -COPY --from=devtoolset /dt8 /dt8 - -# Install TensorRT. -RUN apt-get update && apt-get install -y \ - libnvinfer-dev=6.0.1-1+cuda10.1 \ - libnvinfer6=6.0.1-1+cuda10.1 \ - libnvinfer-plugin-dev=6.0.1-1+cuda10.1 \ - libnvinfer-plugin6=6.0.1-1+cuda10.1 \ - && \ - rm -rf /var/lib/apt/lists/* - -# Copy and run the install scripts. -ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d" -COPY install/*.sh /install/ -ARG DEBIAN_FRONTEND=noninteractive -RUN /install/install_bootstrap_deb_packages.sh -RUN /install/install_deb_packages.sh -RUN /install/install_latest_clang.sh -RUN /install/install_bazel.sh - -# Install python 3.6. -RUN apt-get install --reinstall python3-apt -RUN yes "" | add-apt-repository ppa:deadsnakes/ppa -RUN apt-get update && apt-get install -y \ - python3.6 python3.6-dev python3-pip python3.6-venv && \ - rm -rf /var/lib/apt/lists/* && \ - python3.6 -m pip install pip --upgrade && \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 0 - -RUN /install/install_pip_packages.sh - -# Install python 3.8. -RUN apt-get update && apt-get install -y python3.8 python3.8-dev python3.8-venv -RUN rm -rf /var/lib/apt/lists/* -# Have to download get-pip.py due to a pip circular issue -# https://stackoverflow.com/questions/58758447/how-to-fix-module-platform-has-no-attribute-linux-distribution-when-instal -RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py -RUN python3.8 get-pip.py -RUN python3.8 -m pip install --upgrade pip setuptools wheel - -# Overwrite include paths that are generated for the multipython image. -RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" -RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" - -RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" -RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" - -# Make apt work with python 3.6. -RUN cp /usr/lib/python3/dist-packages/apt_pkg.cpython-35m-x86_64-linux-gnu.so \ - /usr/lib/python3/dist-packages/apt_pkg.so diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython deleted file mode 100644 index c135dd5bd5d667..00000000000000 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython +++ /dev/null @@ -1,87 +0,0 @@ -# Dockerfile to build a manylinux 2010 compliant cross-compiler. 
-# -# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible -# glibc (2.12) and system libstdc++ (4.4). -# -# To push a new version, run: -# $ docker build -f Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython \ -# --tag "gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython" . -# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython - -FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04 as devtoolset - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y \ - cpio \ - file \ - flex \ - g++ \ - make \ - rpm2cpio \ - unar \ - wget \ - && \ - rm -rf /var/lib/apt/lists/* - -ADD devtoolset/fixlinks.sh fixlinks.sh -ADD devtoolset/build_devtoolset.sh build_devtoolset.sh -ADD devtoolset/rpm-patch.sh rpm-patch.sh - -# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7. -RUN /build_devtoolset.sh devtoolset-7 /dt7 -# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-8 in /dt8. -RUN /build_devtoolset.sh devtoolset-8 /dt8 - -# TODO(klimek): Split up into two different docker images. -FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04 -COPY --from=devtoolset /dt7 /dt7 -COPY --from=devtoolset /dt8 /dt8 - -# Install TensorRT. -RUN apt-get update && apt-get install -y \ - libnvinfer-dev=6.0.1-1+cuda10.1 \ - libnvinfer6=6.0.1-1+cuda10.1 \ - libnvinfer-plugin-dev=6.0.1-1+cuda10.1 \ - libnvinfer-plugin6=6.0.1-1+cuda10.1 \ - && \ - rm -rf /var/lib/apt/lists/* - -# Copy and run the install scripts. -ARG DEBIAN_FRONTEND=noninteractive - -COPY install/install_bootstrap_deb_packages.sh /install/ -RUN /install/install_bootstrap_deb_packages.sh - -COPY install/install_deb_packages.sh /install/ -RUN /install/install_deb_packages.sh - -# Install additional packages needed for this image: -# - dependencies to build Python from source -# - patchelf, as it is required by auditwheel -RUN apt-get update && apt-get install -y \ - libbz2-dev \ - libffi-dev \ - libgdbm-dev \ - libncurses5-dev \ - libnss3-dev \ - libreadline-dev \ - patchelf \ - && \ - rm -rf /var/lib/apt/lists/* - -COPY install/install_bazel.sh /install/ -RUN /install/install_bazel.sh - -COPY install/build_and_install_python.sh /install/ -RUN /install/build_and_install_python.sh "3.6.9" -RUN /install/build_and_install_python.sh "3.7.7" -RUN /install/build_and_install_python.sh "3.8.2" - -COPY install/install_pip_packages_by_version.sh /install/ -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.6" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8" - -ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d" -COPY install/install_latest_clang.sh /install/ -RUN /install/install_latest_clang.sh diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython deleted file mode 100644 index b8b9e2195b7830..00000000000000 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython +++ /dev/null @@ -1,88 +0,0 @@ -# Dockerfile to build a manylinux 2010 compliant cross-compiler. -# -# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible -# glibc (2.12) and system libstdc++ (4.4). 
-# -# To push a new version, run: -# $ docker build -f Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython \ -# --tag "gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython" . -# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython - -FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 as devtoolset - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y \ - cpio \ - file \ - flex \ - g++ \ - make \ - rpm2cpio \ - unar \ - wget \ - && \ - rm -rf /var/lib/apt/lists/* - -ADD devtoolset/fixlinks.sh fixlinks.sh -ADD devtoolset/build_devtoolset.sh build_devtoolset.sh -ADD devtoolset/rpm-patch.sh rpm-patch.sh - -# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7. -RUN /build_devtoolset.sh devtoolset-7 /dt7 -# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-8 in /dt8. -RUN /build_devtoolset.sh devtoolset-8 /dt8 - -# TODO(klimek): Split up into two different docker images. -FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -COPY --from=devtoolset /dt7 /dt7 -COPY --from=devtoolset /dt8 /dt8 - -# Install TensorRT. -RUN apt-get update && apt-get install -y \ - libnvinfer-dev=6.0.1-1+cuda10.1 \ - libnvinfer6=6.0.1-1+cuda10.1 \ - libnvinfer-plugin-dev=6.0.1-1+cuda10.1 \ - libnvinfer-plugin6=6.0.1-1+cuda10.1 \ - && \ - rm -rf /var/lib/apt/lists/* - -# Copy and run the install scripts. -ARG DEBIAN_FRONTEND=noninteractive - -COPY install/install_bootstrap_deb_packages.sh /install/ -RUN /install/install_bootstrap_deb_packages.sh - -COPY install/install_deb_packages.sh /install/ -RUN /install/install_deb_packages.sh - -# Install additional packages needed for this image: -# - dependencies to build Python from source -# - patchelf, as it is required by auditwheel -RUN apt-get update && apt-get install -y \ - libbz2-dev \ - libffi-dev \ - libgdbm-dev \ - libncurses5-dev \ - libnss3-dev \ - libreadline-dev \ - patchelf \ - && \ - rm -rf /var/lib/apt/lists/* - -COPY install/install_bazel.sh /install/ -RUN /install/install_bazel.sh - -COPY install/build_and_install_python.sh /install/ -RUN /install/build_and_install_python.sh "3.5.9" -RUN /install/build_and_install_python.sh "3.6.9" -RUN /install/build_and_install_python.sh "3.7.7" -RUN /install/build_and_install_python.sh "3.8.2" - -COPY install/install_pip_packages_by_version.sh /install/ -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.6" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8" - -ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d" -COPY install/install_latest_clang.sh /install/ -RUN /install/install_latest_clang.sh diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython deleted file mode 100644 index 5a86fb05d119b6..00000000000000 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython +++ /dev/null @@ -1,78 +0,0 @@ -# Dockerfile to build a manylinux 2010 compliant cross-compiler. -# -# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible -# glibc (2.12) and system libstdc++ (4.4). 
-# -# To push a new version, run: -# $ docker build -f Dockerfile.rbe.cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython \ -# --tag "gcr.io/tensorflow-testing/nosla-cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython" . -# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython - -FROM gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu16.04-manylinux2010 - -RUN apt-get update -RUN apt-get remove -y --allow-change-held-packages cuda-license-10-0 libcudnn7 libcudnn8 libnccl2 libnccl-dev -RUN apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ - libcublas10 \ - libcublas-dev \ - cuda-nvml-dev-10.2 \ - cuda-command-line-tools-10.2 \ - cuda-libraries-dev-10.2 \ - cuda-minimal-build-10.2 \ - libcudnn7=7.6.5.32-1+cuda10.2 \ - libcudnn7-dev=7.6.5.32-1+cuda10.2 -RUN rm -f /usr/local/cuda -RUN ln -s /usr/local/cuda-10.2 /usr/local/cuda - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y \ - cpio \ - file \ - flex \ - g++ \ - make \ - rpm2cpio \ - unar \ - wget \ - && \ - rm -rf /var/lib/apt/lists/* - -# Copy and run the install scripts. -ARG DEBIAN_FRONTEND=noninteractive - -COPY install/install_bootstrap_deb_packages.sh /install/ -RUN /install/install_bootstrap_deb_packages.sh - -COPY install/install_deb_packages.sh /install/ -RUN /install/install_deb_packages.sh - -# Install additional packages needed for this image: -# - dependencies to build Python from source -# - patchelf, as it is required by auditwheel -RUN apt-get update && apt-get install -y \ - libbz2-dev \ - libffi-dev \ - libgdbm-dev \ - libncurses5-dev \ - libnss3-dev \ - libreadline-dev \ - patchelf \ - && \ - rm -rf /var/lib/apt/lists/* - -COPY install/install_bazel.sh /install/ -RUN /install/install_bazel.sh - -COPY install/build_and_install_python.sh /install/ -RUN /install/build_and_install_python.sh "3.7.7" -RUN /install/build_and_install_python.sh "3.8.2" -RUN /install/build_and_install_python.sh "3.9.0" - -COPY install/install_pip_packages_by_version.sh /install/ -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9" - -ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d" -COPY install/install_latest_clang.sh /install/ -RUN /install/install_latest_clang.sh diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython deleted file mode 100644 index 3f90ac008459fc..00000000000000 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython +++ /dev/null @@ -1,93 +0,0 @@ -# Dockerfile to build a manylinux 2010 compliant cross-compiler. -# -# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible -# glibc (2.12) and system libstdc++ (4.4). -# -# To push a new version, run: -# $ docker build -f Dockerfile.rbe.cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython \ -# --tag "gcr.io/tensorflow-testing/nosla-cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython" . 
-# $ docker push gcr.io/tensorflow-testing/nosla-cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython - -FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 as devtoolset - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y \ - cpio \ - file \ - flex \ - g++ \ - make \ - patch \ - rpm2cpio \ - unar \ - wget \ - xz-utils \ - && \ - rm -rf /var/lib/apt/lists/* - -ADD devtoolset/fixlinks.sh fixlinks.sh -ADD devtoolset/build_devtoolset.sh build_devtoolset.sh -ADD devtoolset/rpm-patch.sh rpm-patch.sh - -# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7. -RUN /build_devtoolset.sh devtoolset-7 /dt7 -# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-8 in /dt8. -RUN /build_devtoolset.sh devtoolset-8 /dt8 - -# TODO(klimek): Split up into two different docker images. -FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 -COPY --from=devtoolset /dt7 /dt7 -COPY --from=devtoolset /dt8 /dt8 - -# Install TensorRT. -RUN echo \ - deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 / \ - > /etc/apt/sources.list.d/nvidia-ml.list \ - && \ - apt-get update && apt-get install -y \ - libnvinfer-dev=7.1.3-1+cuda11.0 \ - libnvinfer7=7.1.3-1+cuda11.0 \ - libnvinfer-plugin-dev=7.1.3-1+cuda11.0 \ - libnvinfer-plugin7=7.1.3-1+cuda11.0 \ - && \ - rm -rf /var/lib/apt/lists/* - -# Copy and run the install scripts. -ARG DEBIAN_FRONTEND=noninteractive - -COPY install/install_bootstrap_deb_packages.sh /install/ -RUN /install/install_bootstrap_deb_packages.sh - -COPY install/install_deb_packages.sh /install/ -RUN /install/install_deb_packages.sh - -# Install additional packages needed for this image: -# - dependencies to build Python from source -# - patchelf, as it is required by auditwheel -RUN apt-get update && apt-get install -y \ - libbz2-dev \ - libffi-dev \ - libgdbm-dev \ - libncurses5-dev \ - libnss3-dev \ - libreadline-dev \ - patchelf \ - && \ - rm -rf /var/lib/apt/lists/* - -COPY install/install_bazel.sh /install/ -RUN /install/install_bazel.sh - -COPY install/build_and_install_python.sh /install/ -RUN /install/build_and_install_python.sh "3.6.9" -RUN /install/build_and_install_python.sh "3.7.7" -RUN /install/build_and_install_python.sh "3.8.2" - -COPY install/install_pip_packages_by_version.sh /install/ -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.6" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8" - -ENV CLANG_VERSION="r7f6f9f4cf966c78a315d15d6e913c43cfa45c47c" -COPY install/install_latest_clang.sh /install/ -RUN /install/install_latest_clang.sh diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython deleted file mode 100644 index cb149c9d82ba21..00000000000000 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython +++ /dev/null @@ -1,105 +0,0 @@ -# Dockerfile to build a manylinxu2010/manylinux 2014 compliant cross-compiler. -# -# Builds a devtoolset-7 environment with manylinux2010 compatible glibc (2.12) and -# libstdc++ (4.4) in /dt7. -# -# Builds a devtoolset-9 environment with manylinux2014 compatible glibc (2.17) and -# libstdc++ (4.8) in /dt9. 
-# -# To push a new version, run: -# $ docker build -f Dockerfile.rbe.cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython \ -# --tag "gcr.io/tensorflow-testing/nosla-cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython" . -# $ docker push gcr.io/tensorflow-testing/nosla-cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython - -FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu20.04 as devtoolset - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y \ - cpio \ - file \ - flex \ - g++ \ - make \ - patch \ - rpm2cpio \ - unar \ - wget \ - xz-utils \ - && \ - rm -rf /var/lib/apt/lists/* - -ADD devtoolset/fixlinks.sh fixlinks.sh -ADD devtoolset/build_devtoolset.sh build_devtoolset.sh -ADD devtoolset/rpm-patch.sh rpm-patch.sh - -# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7. -RUN /build_devtoolset.sh devtoolset-7 /dt7 -# Set up a sysroot for glibc 2.17 / libstdc++ 4.8 / devtoolset-9 in /dt9. -RUN /build_devtoolset.sh devtoolset-9 /dt9 - -# TODO(klimek): Split up into two different docker images. -FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu20.04 -COPY --from=devtoolset /dt7 /dt7 -COPY --from=devtoolset /dt9 /dt9 - -# Install TensorRT. -RUN echo \ - deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 / \ - > /etc/apt/sources.list.d/nvidia-ml.list \ - && \ - apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80 && \ - apt-get update && apt-get install -y \ - libnvinfer-dev=7.2.2-1+cuda11.1 \ - libnvinfer7=7.2.2-1+cuda11.1 \ - libnvinfer-plugin-dev=7.2.2-1+cuda11.1 \ - libnvinfer-plugin7=7.2.2-1+cuda11.1 \ - && \ - rm -rf /var/lib/apt/lists/* - -# Copy and run the install scripts. -ARG DEBIAN_FRONTEND=noninteractive - -COPY install/install_bootstrap_deb_packages.sh /install/ -RUN /install/install_bootstrap_deb_packages.sh - -COPY install/install_deb_packages.sh /install/ -RUN /install/install_deb_packages.sh - -# Install additional packages needed for this image: -# - dependencies to build Python from source -# - patchelf, as it is required by auditwheel -RUN apt-get update && apt-get install -y \ - libbz2-dev \ - libffi-dev \ - libgdbm-dev \ - libncurses5-dev \ - libnss3-dev \ - libreadline-dev \ - libsqlite3-dev \ - patchelf \ - && \ - rm -rf /var/lib/apt/lists/* - -COPY install/install_bazel.sh /install/ -RUN /install/install_bazel.sh - -COPY install/build_and_install_python.sh /install/ -RUN /install/build_and_install_python.sh "3.7.7" -RUN /install/build_and_install_python.sh "3.8.2" -RUN /install/build_and_install_python.sh "3.9.4" -RUN /install/build_and_install_python.sh "3.10.0" - -COPY install/install_pip_packages_by_version.sh /install/ -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.10" - -ENV CLANG_VERSION="rf2b94bd7eaa83d853dc7568fac87b1f8bf4ddec6" -COPY install/install_latest_clang.sh /install/ -RUN /install/install_latest_clang.sh - -# TensorRT 7 for CUDA 11.1 is compatible with CUDA 11.2, but requires -# libnvrtc.so.11.1. See https://github.com/NVIDIA/TensorRT/issues/1064. -# TODO(b/187962120): Remove when upgrading to TensorRT 8. 
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda-11.1/lib64" diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu deleted file mode 100644 index c4912a65b65d61..00000000000000 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu +++ /dev/null @@ -1,26 +0,0 @@ -FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 - -LABEL maintainer="Nick Lopez " - -# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to -# /usr/local/cuda -RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include -RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64 - -# Copy and run the install scripts. -COPY install/*.sh /install/ -ARG DEBIAN_FRONTEND=noninteractive -RUN /install/install_bootstrap_deb_packages.sh -RUN add-apt-repository -y ppa:openjdk-r/ppa && \ - add-apt-repository -y ppa:george-edison55/cmake-3.x -RUN /install/install_deb_packages.sh -RUN /install/install_pip_packages.sh -RUN /install/install_golang.sh - -# Install clang from pre-built package -RUN cd /tmp && \ - wget https://storage.googleapis.com/clang-builds-stable/clang-ubuntu16_04/clang_r337145.tar.gz && \ - echo "ab98c63eb09c04112cc992bc95ebc0dcea8c5e9d0760438789be2896cdc69ff8 clang_r337145.tar.gz" | sha256sum -c && \ - tar -C /usr/local -xf clang_r323528.tar.gz && \ - rm clang_r337145.tar.gz - diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython deleted file mode 100644 index e9974b05b3cab8..00000000000000 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython +++ /dev/null @@ -1,82 +0,0 @@ -# Dockerfile for ROCm RBE builds. -# -# To push a new version, run: -# $ docker build -f Dockerfile.local-toolchain-ubuntu18.04-manylinux2010 \ -# --tag "local-toolchain-ubuntu18.04-manylinux2010" . -# $ docker build -f Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython \ -# --tag "gcr.io/tensorflow-testing/nosla-rocm-ubuntu18.04-manylinux2010-multipython" . -# $ docker push gcr.io/tensorflow-testing/nosla-rocm-ubuntu18.04-manylinux2010-multipython - -FROM ubuntu:18.04 -COPY --from=local-toolchain-ubuntu18.04-manylinux2010 /dt7 /dt7 - -ARG DEBIAN_FRONTEND=noninteractive - -# Install ROCm packages -RUN apt-get update && apt-get install -y --no-install-recommends \ - curl libnuma-dev gnupg sudo libelf1 build-essential \ - && curl -k -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \ - && printf "deb [arch=amd64] http://repo.radeon.com/rocm/apt/5.0/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list \ - && apt-get update && apt-get install -y --no-install-recommends \ - rocm-dev rocm-libs rccl \ - && apt-get clean && rm -rf /var/lib/apt/lists/* - -# Set ROCm environment variables and paths. -# We use /opt/rocm without version suffix so that the toolchain configuration -# for builtin headers doesn't need to be adapted -ARG ROCM_PATH=/opt/rocm -ENV HCC_HOME=$ROCM_PATH/hcc -ENV HIP_PATH=$ROCM_PATH/hip -ENV OPENCL_ROOT=$ROCM_PATH/opencl -ENV PATH="$ROCM_PATH/bin:${PATH}" -ENV PATH="$HCC_HOME/bin:$HIP_PATH/bin:${PATH}" -ENV PATH="$OPENCL_ROOT/bin:${PATH}" - -# Set target file to help determine which device(s) to build for -RUN bash -c "ls -al /opt/roc*" -RUN bash -c "echo -e 'gfx900\ngfx906\ngfx908' > $ROCM_PATH/bin/target.lst" - -# Copy and run the install scripts. 
-COPY install/install_bootstrap_deb_packages.sh /install/ -RUN /install/install_bootstrap_deb_packages.sh - -COPY install/install_deb_packages.sh /install/ -RUN /install/install_deb_packages.sh - -# Install additional packages needed for this image: -# - bsdmainutils (hexdump) for MLIR generated GPU kernels -# - dependencies to build Python from source -# - patchelf, as it is required by auditwheel -RUN apt-get update && apt-get install -y \ - bsdmainutils \ - libbz2-dev \ - libffi-dev \ - libgdbm-dev \ - libncurses5-dev \ - libnss3-dev \ - libreadline-dev \ - patchelf \ - && \ - rm -rf /var/lib/apt/lists/* - -# Install openjdk 11 -RUN yes "" | add-apt-repository ppa:openjdk-r/ppa -RUN apt-get update && apt-get install -y openjdk-11-jdk && \ - update-alternatives --auto java - -COPY install/install_bazel.sh /install/ -RUN /install/install_bazel.sh - -COPY install/build_and_install_python.sh /install/ -RUN /install/build_and_install_python.sh "3.7.7" -RUN /install/build_and_install_python.sh "3.8.2" -RUN /install/build_and_install_python.sh "3.9.4" - -COPY install/install_pip_packages_by_version.sh /install/ -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9" - -ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d" -COPY install/install_latest_clang.sh /install/ -RUN /install/install_latest_clang.sh diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu20.04-manylinux2014-multipython deleted file mode 100644 index 32834ccac2a3af..00000000000000 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu20.04-manylinux2014-multipython +++ /dev/null @@ -1,86 +0,0 @@ -# Dockerfile for ROCm RBE builds. -# -# To push a new version, run: -# $ docker build -f Dockerfile.local-toolchain-ubuntu20.04-manylinux2014 \ -# --tag "local-toolchain-ubuntu20.04-manylinux2014" . -# $ docker build -f Dockerfile.rbe.rocm-ubuntu20.04-manylinux2014-multipython \ -# --tag "gcr.io/tensorflow-testing/nosla-rocm-ubuntu20.04-manylinux2014-multipython" . -# $ docker push gcr.io/tensorflow-testing/nosla-rocm-ubuntu20.04-manylinux2014-multipython - -FROM ubuntu:20.04 -COPY --from=local-toolchain-ubuntu20.04-manylinux2014 /dt7 /dt7 -COPY --from=local-toolchain-ubuntu20.04-manylinux2014 /dt9 /dt9 - -ARG DEBIAN_FRONTEND=noninteractive - -# Install ROCm packages -RUN apt-get update && apt-get install -y --no-install-recommends \ - curl libnuma-dev gnupg sudo libelf1 build-essential \ - && curl -k -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \ - && printf "deb [arch=amd64] http://repo.radeon.com/rocm/apt/5.3/ ubuntu main" \ - | tee /etc/apt/sources.list.d/rocm.list \ - && apt-get update && apt-get install -y \ - rocm-dev rocm-libs rccl \ - && apt-get clean && rm -rf /var/lib/apt/lists/* - -# Set ROCm environment variables and paths. 
-# We use /opt/rocm without version suffix so that the toolchain configuration -# for builtin headers doesn't need to be adapted -ARG ROCM_PATH=/opt/rocm -ENV HCC_HOME=$ROCM_PATH/hcc -ENV HIP_PATH=$ROCM_PATH/hip -ENV OPENCL_ROOT=$ROCM_PATH/opencl -ENV PATH="$ROCM_PATH/bin:${PATH}" -ENV PATH="$HCC_HOME/bin:$HIP_PATH/bin:${PATH}" -ENV PATH="$OPENCL_ROOT/bin:${PATH}" - -# Set target file to help determine which device(s) to build for -RUN bash -c "ls -al /opt/roc*" -RUN bash -c "echo -e 'gfx900\ngfx906\ngfx908' > $ROCM_PATH/bin/target.lst" - -# Copy and run the install scripts. -COPY install/install_bootstrap_deb_packages.sh /install/ -RUN /install/install_bootstrap_deb_packages.sh - -COPY install/install_deb_packages.sh /install/ -RUN /install/install_deb_packages.sh - -# Install additional packages needed for this image: -# - dependencies to build Python from source -# - patchelf, as it is required by auditwheel -RUN apt-get update && apt-get install -y \ - libbz2-dev \ - libffi-dev \ - libgdbm-dev \ - libncurses5-dev \ - libnss3-dev \ - libreadline-dev \ - patchelf \ - && \ - rm -rf /var/lib/apt/lists/* - -# Install openjdk 11 -RUN yes "" | add-apt-repository ppa:openjdk-r/ppa -RUN apt-get update && apt-get install -y openjdk-11-jdk && \ - update-alternatives --auto java - -COPY install/install_bazel.sh /install/ -RUN /install/install_bazel.sh - -COPY install/build_and_install_python.sh /install/ -RUN /install/build_and_install_python.sh "3.7.7" -RUN /install/build_and_install_python.sh "3.8.2" -RUN /install/build_and_install_python.sh "3.9.4" -RUN /install/build_and_install_python.sh "3.10.0" -RUN /install/build_and_install_python.sh "3.11.0" - -COPY install/install_pip_packages_by_version.sh /install/ -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7" "nojax" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8" "nojax" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9" "nojax" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.10" "nojax" -RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.11" "nojax" - -ENV CLANG_VERSION="rf2b94bd7eaa83d853dc7568fac87b1f8bf4ddec6" -COPY install/install_latest_clang.sh /install/ -RUN /install/install_latest_clang.sh diff --git a/tensorflow/tools/ci_build/install/install_latest_clang.sh b/tensorflow/tools/ci_build/install/install_latest_clang.sh deleted file mode 100755 index 5eed5d2141fcf0..00000000000000 --- a/tensorflow/tools/ci_build/install/install_latest_clang.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -eu -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Contact c-toolchain-team@ for new releases or new ubuntu versions. 
-DIST="$(grep "DISTRIB_RELEASE" /etc/lsb-release |sed 's,.*=,,; s,\.,_,')" -INSTALL_DIR="/clang_${CLANG_VERSION}" -STORAGE="https://storage.googleapis.com/clang-builds-stable" -mkdir -p "${INSTALL_DIR}" -cd "${INSTALL_DIR}" -wget "${STORAGE}/clang-ubuntu${DIST}/clang_${CLANG_VERSION}.tar.gz" -tar xvzf clang_${CLANG_VERSION}.tar.gz -rm clang_${CLANG_VERSION}.tar.gz diff --git a/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh index 36c8dbd6c29948..ca6e0b612d5a47 100755 --- a/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh +++ b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh @@ -64,13 +64,31 @@ if [ ! -d /tf ];then mkdir /tf fi -# vvv TODO (rocm) weekly-sync-20251021 excluded tests +# vvv TODO (rocm) weekly-sync-20251224 excluded tests EXCLUDED_TESTS=( # //tensorflow/core/kernels:matmul_op_test_gpu Test/FusedMatMulWithBiasOpTest/1.MatMul* # //tensorflow/core/common_runtime:process_function_library_runtime_test_gpu ProcessFunctionLibraryRuntimeTest.MultiDevice_ResourceOutput_GPU + + # //tensorflow/compiler/tests:randomized_tests_seeded + # //tensorflow/compiler/tests:randomized_tests_mlir_seeded + OpTest.ScatterNd + + # //tensorflow/core/util/autotune_maps:autotune_serialize_test_gpu + AutotuneSerializeTest.Consistency + AutotuneSerializeTest.VersionControl + + # //tensorflow/python/kernel_tests/nn_ops:depthwise_conv_op_d9m_test + DepthwiseConv2DDeterministicTest.testBackwardDeterminismGPU + + # //tensorflow/python/kernel_tests/sparse_ops:sparse_ops_test + SparseFillEmptyRowsTest.testSparseFillEmptyRowsGradInvalidReverseIndexMap + + # //tensorflow/core/profiler/backends/gpu:device_tracer_test + DeviceTracerTest.StartTwoTracers + DeviceTracerTest.TraceToXSpace ) # Run bazel test command. Double test timeouts to avoid flakes. 
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_xla.sh b/tensorflow/tools/ci_build/linux/rocm/run_xla.sh index ea43eddc474626..9bef5a24a46366 100755 --- a/tensorflow/tools/ci_build/linux/rocm/run_xla.sh +++ b/tensorflow/tools/ci_build/linux/rocm/run_xla.sh @@ -138,6 +138,73 @@ EXCLUDED_TESTS=( # @local_xla//xla/tests:multioutput_fusion_test_amdgpu_any MultiOutputFusionTest.MultiOutputReduceFusionMajorWithExtraOutput + + # vvv TODO (rocm) weekly-sync-20251224 excluded tests + # @local_xla//xla/service/gpu:gpu_compiler_test_amdgpu_any + PersistedAutotuningTest.SingleOperationGetsAutotuned + + # @local_xla//xla/backends/gpu/codegen/triton:support_test + BitcastOrReshapeTestSuite/BitcastOrReshapeTest.IsTritonSupportedBitcastOrReshape* + BitcastOrReshapeTestSuite/BitcastOrReshapeTest.IsTritonSupported0DBitcastOrReshape* + BitcastConvertSuite/BitcastConvertTest.BitcastConvertDisguisedAsBitcast* + UnaryElementwiseTestSuite/UnaryElementwiseTest.IsTritonSupportedUnaryElementwise* + ConvertTestSuite/ConvertTest.Convert* + BinaryElementwiseTestSuite/BinaryElementwiseTest.IsTritonSupportedBinaryElementwise* + TernaryElementwiseTestSuite/TernaryElementwiseTest.IsTritonSupportedTernaryElementwise* + ReductionComputationTestSuite/ReductionComputationTest.DifferentBinaryOps* + TransposeTestSuite/TransposeTest.LoadTranspose3D* + SliceTestSuite/SliceTest.ContinuousSlice* + BroadcastTestSuite/BroadcastTest.Broadcast* + ParameterTestSuite/ParameterTest.Parameter* + ConstantTestSuite/ConstantTest.ConstantEffectiveScalar* + DotTestSuite/DotTypesTest.Dot* + + # @local_xla//xla/backends/gpu/codegen/triton:support_legacy_test + DotTestTestSuite/DotTest.IsTritonSupportedExecutesCorrectlyForDot/f8e5m2_dot + + # @local_xla//xla/backends/gpu/profiler:kernel_name_tracer_test + KernelNameTracerTest.Create + KernelNameTracerTest.CaptureKernelNames + KernelNameTracerTest.CaptureKernelNamesFromCommandBufferThunk + + # @local_xla//xla/service/gpu/autotuning:gemm_fusion_autotuner_test + GemmFusionAutotunerTest.Int8FusedGemm256 + GemmFusionAutotunerLevelSweep/GemmFusionAutotunerLevelTest.Deviceless/0 + + # @local_xla//xla/service/gpu/tests:swap_conv_operands_test + SwapConvOperandsTest.LargePadding + SwapConvOperandsTest.SmallPadding + SwapConvOperandsTest.DoesNotLower + + # @local_xla//xla/service/gpu/tests:gpu_triton_custom_call_test + GpuIrEmitterUnnestedTest.CanNotEmitTritonCustomCallOnPreAmpereGpu + + # @local_xla//xla/tests:convolution_autotune_disabled_test + Transposed2DConvHloTest/Transposed2DConvHloTest.Simple* + ConvolveWithAndWithoutCanonicalization_Instantiation/ConvolveWithAndWithoutCanonicalization.Convolve2D_NoSpatialDims* + ConvolutionHloTest.ConvolveBackwardInput + ConvolutionHloTest.TestConv0D + ConvolutionHloTest.TestConv2DF16 + ConvolutionHloTest.SwappedOperandConvolveWithStride + ConvolutionHloTest.TestFusedConv3D + ConvolutionHloTest.SwappedOperandConvolve + ConvolutionHloTest.TestBooleanInput + ConvolutionHloTest.SwappedOperandConvolve2 + ConvolutionTest.Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid + ConvolutionTest.ConvolveF32BackwardInputGroupedConvolution + Convolve_1x1x4x4_1x1x2x2_Valid/2.Types + Convolve_1x1x4x4_1x1x2x2_Valid/1.Types + Convolve_1x1x4x4_1x1x2x2_Same/1.Types + Convolve_1x1x4x4_1x1x2x2_Same/2.Types + Convolve_1x1x4x4_1x1x3x3_Same/1.Types + Convolve_1x1x4x4_1x1x3x3_Same/2.Types + Convolve2D* + + # @local_xla//xla/tests:convolution_1d_autotune_disabled_test + ConvolutionTest.Convolve1D* + Convolve1D_1x2x5_1x2x2* + Convolve1D1WindowTest_Instantiation/Convolve1D1WindowTestFloat* + 
Convolve1D1WindowTest_Instantiation/Convolve1D1WindowTestHalf* ) bazel --bazelrc=tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rocm.bazelrc test \ diff --git a/tensorflow/tools/gcs_test/Dockerfile b/tensorflow/tools/gcs_test/Dockerfile index b5fbef19051f8a..19958cb6478765 100644 --- a/tensorflow/tools/gcs_test/Dockerfile +++ b/tensorflow/tools/gcs_test/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:24.04@sha256:66460d557b25769b102175144d538d88219c077c678a49af4afca6fbfc1b5252 +FROM ubuntu:24.04@sha256:c35e29c9450151419d9448b0fd75374fec4fff364a27f176fb458d472dfc9e54 LABEL maintainer="Shanqing Cai " diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc index 269137c997d447..3d8eabc8361f6b 100644 --- a/tensorflow/tools/graph_transforms/quantize_nodes.cc +++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc @@ -32,17 +32,17 @@ namespace graph_transforms { // into the quantized equivalent. struct QuantizedOpInfo { // The name of the float op. - string float_name; + std::string float_name; // Which attributes to copy directly over. - std::vector attrs_to_copy; + std::vector attrs_to_copy; // Extra data type attributes we need to set. - std::vector> dtypes_to_set; + std::vector> dtypes_to_set; // What depth of inputs the op can read in. DataType input_bit_depth; // The depth of the op's quantized outputs. DataType output_bit_depth; // Which inputs (e.g. shapes) aren't involved in the quantization process. - std::set unquantized_inputs; + std::set unquantized_inputs; // How the outputs are arranged, either // [input0, input1, min0, max0, min1, max1] for contiguous, or // [input0, input1, min0, min1, max0, max1] for separate. @@ -145,12 +145,12 @@ const std::vector& GetQuantizedOpList() { namespace { // Replaces invalid characters in input names to get a unique node name. -string UniqueNodeNameFromInput(const string& input_name) { - string prefix; - string node_name; - string suffix; +std::string UniqueNodeNameFromInput(const std::string& input_name) { + std::string prefix; + std::string node_name; + std::string suffix; NodeNamePartsFromInput(input_name, &prefix, &node_name, &suffix); - string result; + std::string result; if (prefix == "^") { result += "__hat__"; } @@ -163,9 +163,10 @@ string UniqueNodeNameFromInput(const string& input_name) { // Pulls two float values from the named parameters, with a lot of checking. absl::Status ExtractRangeFromParams(const TransformFuncContext& context, - const string& min_name, - const string& max_name, float* min_value, - float* max_value, bool* has_range) { + const std::string& min_name, + const std::string& max_name, + float* min_value, float* max_value, + bool* has_range) { // See if we've been given quantized inputs with a known range. const bool has_min = (context.params.count(min_name) != 0); const bool has_max = (context.params.count(max_name) != 0); @@ -193,17 +194,17 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def, const TransformFuncContext& context, GraphDef* output_graph_def) { // Make sure we can look up inputs and outputs quickly. 
- std::set input_names(context.input_names.begin(), - context.input_names.end()); - std::set output_names(context.output_names.begin(), - context.output_names.end()); + std::set input_names(context.input_names.begin(), + context.input_names.end()); + std::set output_names(context.output_names.begin(), + context.output_names.end()); GraphDef current_graph_def = input_graph_def; // Keep running the merging until no more duplicates are found. bool any_duplicates_found; do { any_duplicates_found = false; // First arrange all of the nodes by a hash of their contents. - std::map> hashed_nodes; + std::map> hashed_nodes; for (const NodeDef& node : current_graph_def.node()) { NodeDef nameless_node = node; // The name matters if it's being used as an input or output node, @@ -211,14 +212,14 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def, if (!input_names.count(node.name()) && !output_names.count(node.name())) { nameless_node.set_name(""); } - const uint64 hash = HashNodeDef(nameless_node); + const uint64_t hash = HashNodeDef(nameless_node); hashed_nodes[hash].push_back(&node); } // If we have multiple nodes with the same hash, then we know they're // duplicates and can be removed, unless they're stateful. - std::map inputs_to_rename; + std::map inputs_to_rename; GraphDef merged_graph_def; - for (const std::pair>& + for (const std::pair>& hashed_node_info : hashed_nodes) { const std::vector& hash_node_list = hashed_node_info.second; @@ -229,7 +230,7 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def, OpRegistry::Global()->LookUpOpDef(current_node->op(), &op_def)); const bool is_duplicate = ((!op_def->is_stateful()) && (i > 0)); if (is_duplicate) { - const string original_name = hash_node_list[0]->name(); + const std::string original_name = hash_node_list[0]->name(); inputs_to_rename[current_node->name() + ":*"] = original_name; any_duplicates_found = true; } else { @@ -241,7 +242,7 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def, // Update the graph so that any nodes that referred to removed inputs now // pull from the remaining duplicate. 
TF_RETURN_IF_ERROR(RenameNodeInputs(merged_graph_def, inputs_to_rename, - std::unordered_set(), + std::unordered_set(), ¤t_graph_def)); } while (any_duplicates_found); @@ -261,11 +262,11 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def, absl::Status RemoveRedundantQuantizations(const GraphDef& input_graph_def, const TransformFuncContext& context, GraphDef* output_graph_def) { - std::set graph_outputs; - for (const string& output_name : context.output_names) { + std::set graph_outputs; + for (const std::string& output_name : context.output_names) { graph_outputs.insert(NodeNameFromInput(output_name)); } - std::map inputs_to_rename; + std::map inputs_to_rename; GraphDef replaced_graph_def; TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes( input_graph_def, // clang-format off @@ -276,10 +277,10 @@ absl::Status RemoveRedundantQuantizations(const GraphDef& input_graph_def, {"Max"}, } }, // clang-format on - [&inputs_to_rename, &graph_outputs](const NodeMatch& match, - const std::set& input_nodes, - const std::set& output_nodes, - std::vector* new_nodes) { + [&inputs_to_rename, &graph_outputs]( + const NodeMatch& match, const std::set& input_nodes, + const std::set& output_nodes, + std::vector* new_nodes) { const NodeDef& quantize_node = match.node; const NodeDef& dequantize_node = match.inputs[0].node; inputs_to_rename[quantize_node.name() + ":0"] = @@ -302,7 +303,7 @@ absl::Status RemoveRedundantQuantizations(const GraphDef& input_graph_def, {true}, &replaced_graph_def)); return RenameNodeInputs(replaced_graph_def, inputs_to_rename, - std::unordered_set(), output_graph_def); + std::unordered_set(), output_graph_def); } // If the user has passed in the input_min and input_max args, then we need to @@ -321,15 +322,15 @@ absl::Status QuantizePlaceholders(const GraphDef& input_graph_def, *output_graph_def = input_graph_def; return absl::OkStatus(); } - std::map inputs_to_rename_first_pass; - std::map inputs_to_rename_second_pass; + std::map inputs_to_rename_first_pass; + std::map inputs_to_rename_second_pass; GraphDef placeholder_graph_def; placeholder_graph_def.Clear(); for (const NodeDef& node : input_graph_def.node()) { if (node.op() != "Placeholder") { *(placeholder_graph_def.mutable_node()->Add()) = node; } else { - string namespace_prefix = node.name() + "_eightbit"; + std::string namespace_prefix = node.name() + "_eightbit"; NodeDef quantized_placeholder; quantized_placeholder = node; @@ -354,7 +355,7 @@ absl::Status QuantizePlaceholders(const GraphDef& input_graph_def, SetNodeTensorAttr("value", max_tensor, &max_node); *(placeholder_graph_def.mutable_node()->Add()) = max_node; - const string rename_suffix = "__RENAMED_PLACEHOLDER__"; + const std::string rename_suffix = "__RENAMED_PLACEHOLDER__"; NodeDef dequantize_node; dequantize_node.set_op("Dequantize"); dequantize_node.set_name(namespace_prefix + "/dequantize"); @@ -375,12 +376,12 @@ absl::Status QuantizePlaceholders(const GraphDef& input_graph_def, } GraphDef first_pass_graph_def; - TF_RETURN_IF_ERROR( - RenameNodeInputs(placeholder_graph_def, inputs_to_rename_first_pass, - std::unordered_set(), &first_pass_graph_def)); + TF_RETURN_IF_ERROR(RenameNodeInputs( + placeholder_graph_def, inputs_to_rename_first_pass, + std::unordered_set(), &first_pass_graph_def)); TF_RETURN_IF_ERROR( RenameNodeInputs(first_pass_graph_def, inputs_to_rename_second_pass, - std::unordered_set(), output_graph_def)); + std::unordered_set(), output_graph_def)); return absl::OkStatus(); } @@ -400,15 +401,15 @@ absl::Status 
ConvertFakeQuantsToRequantize(const GraphDef& input_graph_def, {"Const"}, } }, // clang-format on - [](const NodeMatch& match, const std::set& input_nodes, - const std::set& output_nodes, + [](const NodeMatch& match, const std::set& input_nodes, + const std::set& output_nodes, std::vector* new_nodes) { const NodeDef& fake_quant_node = match.node; const NodeDef& original_op_node = match.inputs[0].node; const NodeDef& fake_quant_min_node = match.inputs[1].node; const NodeDef& fake_quant_max_node = match.inputs[2].node; - string namespace_prefix = fake_quant_node.name() + "_eightbit"; + std::string namespace_prefix = fake_quant_node.name() + "_eightbit"; new_nodes->push_back(original_op_node); new_nodes->push_back(fake_quant_min_node); @@ -494,8 +495,8 @@ absl::Status MergeAdjacentRequantizes(const GraphDef& input_graph_def, {"Const"}, } }, // clang-format on - [](const NodeMatch& match, const std::set& input_nodes, - const std::set& output_nodes, + [](const NodeMatch& match, const std::set& input_nodes, + const std::set& output_nodes, std::vector* new_nodes) { const NodeDef& fake_requantize_node = match.node; const NodeDef& original_op_node = @@ -544,8 +545,9 @@ absl::Status HoistFakeQuants(const GraphDef& input_graph_def, GraphDef hoisted_graph_def; TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes( current_graph_def, pattern, - [depth](const NodeMatch& match, const std::set& input_nodes, - const std::set& output_nodes, + [depth](const NodeMatch& match, + const std::set& input_nodes, + const std::set& output_nodes, std::vector* new_nodes) { const NodeDef& fake_quant_node = match.node; const NodeDef& fake_quant_min_node = match.inputs[1].node; @@ -633,17 +635,17 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def, // between adjacent quantized ops, but a later pass removes these where it // can. - std::set ops_to_ignore; + std::set ops_to_ignore; if (context.params.count("ignore_op") > 0) { - for (const string& name : context.params.at("ignore_op")) { + for (const std::string& name : context.params.at("ignore_op")) { ops_to_ignore.insert(name); } } const std::vector& op_list = GetQuantizedOpList(); - string op_pattern; + std::string op_pattern; bool is_first = true; - std::map op_map; + std::map op_map; for (const QuantizedOpInfo& op_info : op_list) { if (ops_to_ignore.count(op_info.float_name) == 0) { absl::StrAppend(&op_pattern, is_first ? "" : "|", op_info.float_name); @@ -692,8 +694,8 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def, TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes( converted_graph_def, {op_pattern}, [&op_map, fallback_min, fallback_max, has_fallback_range]( - const NodeMatch& match, const std::set& input_nodes, - const std::set& output_nodes, + const NodeMatch& match, const std::set& input_nodes, + const std::set& output_nodes, std::vector* new_nodes) { const NodeDef& float_node = match.node; const QuantizedOpInfo& op_info = op_map[float_node.op()]; @@ -728,18 +730,18 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def, return absl::OkStatus(); } - string namespace_prefix = float_node.name() + "_eightbit"; + std::string namespace_prefix = float_node.name() + "_eightbit"; // Quantize all of the inputs. - std::vector quantized_input_names; + std::vector quantized_input_names; for (int i = 0; i < float_node.input_size(); ++i) { // Skip any non-float inputs. 
if (op_info.unquantized_inputs.count(i)) { continue; } - const string& input_name = float_node.input(i); - string unique_input_name = + const std::string& input_name = float_node.input(i); + std::string unique_input_name = namespace_prefix + "/" + UniqueNodeNameFromInput(input_name); // Add some common constants we need for reshaping inputs. @@ -749,8 +751,9 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def, AddNodeInput("^" + NodeNameFromInput(input_name), &reshape_dims); SetNodeAttr("dtype", DT_INT32, &reshape_dims); Tensor reshape_dims_tensor(DT_INT32, {1}); - reshape_dims_tensor.flat()(0) = -1; - SetNodeTensorAttr("value", reshape_dims_tensor, &reshape_dims); + reshape_dims_tensor.flat()(0) = -1; + SetNodeTensorAttr("value", reshape_dims_tensor, + &reshape_dims); new_nodes->push_back(reshape_dims); NodeDef reduction_dims; @@ -759,9 +762,9 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def, AddNodeInput("^" + NodeNameFromInput(input_name), &reduction_dims); SetNodeAttr("dtype", DT_INT32, &reduction_dims); Tensor reduction_dims_tensor(DT_INT32, {1}); - reduction_dims_tensor.flat()(0) = 0; - SetNodeTensorAttr("value", reduction_dims_tensor, - &reduction_dims); + reduction_dims_tensor.flat()(0) = 0; + SetNodeTensorAttr("value", reduction_dims_tensor, + &reduction_dims); new_nodes->push_back(reduction_dims); NodeDef reshape_node; @@ -806,11 +809,11 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def, NodeDef quantized_main_node; quantized_main_node.set_op("Quantized" + float_node.op()); quantized_main_node.set_name(float_node.name() + "/eightbit"); - for (const string& attr_to_copy : op_info.attrs_to_copy) { + for (const std::string& attr_to_copy : op_info.attrs_to_copy) { CopyNodeAttr(float_node, attr_to_copy, attr_to_copy, &quantized_main_node); } - for (const std::pair& dtype_to_set : + for (const std::pair& dtype_to_set : op_info.dtypes_to_set) { SetNodeAttr(dtype_to_set.first, dtype_to_set.second, &quantized_main_node); @@ -820,32 +823,35 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def, if (op_info.unquantized_inputs.count(i)) { AddNodeInput(float_node.input(i), &quantized_main_node); } else { - const string& quantized_input_name = + const std::string& quantized_input_name = quantized_input_names[quantized_input_index]; AddNodeInput(quantized_input_name + ":0", &quantized_main_node); ++quantized_input_index; } } if (op_info.min_max_order == QuantizedOpInfo::CONTIGUOUS_MIN_MAX) { - for (const string& quantized_input_name : quantized_input_names) { + for (const std::string& quantized_input_name : + quantized_input_names) { AddNodeInput(quantized_input_name + ":1", &quantized_main_node); AddNodeInput(quantized_input_name + ":2", &quantized_main_node); } } else { - for (const string& quantized_input_name : quantized_input_names) { + for (const std::string& quantized_input_name : + quantized_input_names) { AddNodeInput(quantized_input_name + ":1", &quantized_main_node); } - for (const string& quantized_input_name : quantized_input_names) { + for (const std::string& quantized_input_name : + quantized_input_names) { AddNodeInput(quantized_input_name + ":2", &quantized_main_node); } } new_nodes->push_back(quantized_main_node); - string eight_bit_node_name; + std::string eight_bit_node_name; if (op_info.output_bit_depth == DT_QINT32) { // Shrink the range of the output down from 32 bits to 8. 
- string requantize_min_input; - string requantize_max_input; + std::string requantize_min_input; + std::string requantize_max_input; if (has_fallback_range) { // Use constant values for the min/max range if they were given. NodeDef fallback_min_node; diff --git a/tensorflow/tools/graph_transforms/remove_attribute.cc b/tensorflow/tools/graph_transforms/remove_attribute.cc index 6fca08585fb271..128672734f7c0b 100644 --- a/tensorflow/tools/graph_transforms/remove_attribute.cc +++ b/tensorflow/tools/graph_transforms/remove_attribute.cc @@ -36,7 +36,7 @@ absl::Status RemoveAttribute(const GraphDef& input_graph_def, "argument, e.g. remove_attribute(op_name=Mul, attribute_name=foo)"); } - string op_name; + std::string op_name; if (context.params.count("op_name")) { if (context.params.at("op_name").size() != 1) { return errors::InvalidArgument( @@ -48,7 +48,7 @@ absl::Status RemoveAttribute(const GraphDef& input_graph_def, op_name = "*"; } - const string attribute_name = context.params.at("attribute_name")[0]; + const std::string attribute_name = context.params.at("attribute_name")[0]; output_graph_def->Clear(); for (const NodeDef& node : input_graph_def.node()) { NodeDef* new_node = output_graph_def->mutable_node()->Add(); diff --git a/tensorflow/tools/mlpbtxt/frommlpbtxt.cc b/tensorflow/tools/mlpbtxt/frommlpbtxt.cc index dec8b6b542a8d0..2817d919dbc915 100644 --- a/tensorflow/tools/mlpbtxt/frommlpbtxt.cc +++ b/tensorflow/tools/mlpbtxt/frommlpbtxt.cc @@ -29,15 +29,15 @@ namespace tensorflow { namespace { int Run(int argc, char** argv) { - string FLAGS_in = ""; - string FLAGS_out = ""; + std::string FLAGS_in = ""; + std::string FLAGS_out = ""; std::vector flag_list = { Flag("in", &FLAGS_in, "Input multi-line proto text (.mlpbtxt) file name"), Flag("out", &FLAGS_out, "Output proto text (.pbtxt) file name")}; // Parse the command-line. - const string usage = Flags::Usage(argv[0], flag_list); + const std::string usage = Flags::Usage(argv[0], flag_list); const bool parse_ok = Flags::Parse(&argc, argv, flag_list); if (argc != 1 || !parse_ok) { printf("%s", usage.c_str()); @@ -47,7 +47,7 @@ int Run(int argc, char** argv) { port::InitMain(argv[0], &argc, &argv); // Read the input file --in. - string in_contents; + std::string in_contents; absl::Status s = ReadFileToString(Env::Default(), FLAGS_in, &in_contents); if (!s.ok()) { printf("Error reading file %s: %s\n", FLAGS_in.c_str(), @@ -56,7 +56,7 @@ int Run(int argc, char** argv) { } // Write the output file --out. - const string out_contents = PBTxtFromMultiline(in_contents); + const std::string out_contents = PBTxtFromMultiline(in_contents); s = WriteStringToFile(Env::Default(), FLAGS_out, out_contents); if (!s.ok()) { printf("Error writing file %s: %s\n", FLAGS_out.c_str(), diff --git a/tensorflow/tools/mlpbtxt/tomlpbtxt.cc b/tensorflow/tools/mlpbtxt/tomlpbtxt.cc index 552d4075619cd3..8c69f5047bb384 100644 --- a/tensorflow/tools/mlpbtxt/tomlpbtxt.cc +++ b/tensorflow/tools/mlpbtxt/tomlpbtxt.cc @@ -30,9 +30,9 @@ namespace tensorflow { namespace { int Run(int argc, char** argv) { - string FLAGS_in = ""; - string FLAGS_out = ""; - string FLAGS_fields = "description"; + std::string FLAGS_in = ""; + std::string FLAGS_out = ""; + std::string FLAGS_fields = "description"; std::vector flag_list = { Flag("in", &FLAGS_in, "Input proto text (.pbtxt) file name"), @@ -41,7 +41,7 @@ int Run(int argc, char** argv) { Flag("fields", &FLAGS_fields, "Comma-separated list of field names")}; // Parse the command-line. 
- const string usage = Flags::Usage(argv[0], flag_list); + const std::string usage = Flags::Usage(argv[0], flag_list); const bool parse_ok = Flags::Parse(&argc, argv, flag_list); if (argc != 1 || !parse_ok) { printf("%s", usage.c_str()); @@ -49,7 +49,7 @@ int Run(int argc, char** argv) { } // Parse the --fields option. - std::vector fields = + std::vector fields = str_util::Split(FLAGS_fields, ',', str_util::SkipEmpty()); if (fields.empty()) { printf("--fields must be non-empty.\n%s", usage.c_str()); @@ -59,7 +59,7 @@ int Run(int argc, char** argv) { port::InitMain(argv[0], &argc, &argv); // Read the input file --in. - string in_contents; + std::string in_contents; absl::Status s = ReadFileToString(Env::Default(), FLAGS_in, &in_contents); if (!s.ok()) { printf("Error reading file %s: %s\n", FLAGS_in.c_str(), @@ -68,7 +68,7 @@ int Run(int argc, char** argv) { } // Write the output file --out. - const string out_contents = PBTxtToMultiline(in_contents, fields); + const std::string out_contents = PBTxtToMultiline(in_contents, fields); s = WriteStringToFile(Env::Default(), FLAGS_out, out_contents); if (!s.ok()) { printf("Error writing file %s: %s\n", FLAGS_out.c_str(), diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 24135c42619d31..c0de3f391710e2 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -448,7 +448,7 @@ py_test( verify_manylinux_compliance_test( name = "manylinux_compliance_test", - aarch64_compliance_tag = "manylinux_2_17_aarch64", + aarch64_compliance_tag = "manylinux_2_27_aarch64", ppc64le_compliance_tag = "manylinux_2_17_ppc64le", test_tags = [ "manual", diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile index 8d35977d14a987..b9d06f956f6d2a 100644 --- a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile +++ b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile @@ -1,5 +1,5 @@ ################################################################################ -FROM ubuntu:22.04@sha256:09506232a8004baa32c47d68f1e5c307d648fdd59f5e7eaa42aaf87914100db3 as builder +FROM ubuntu:22.04@sha256:104ae83764a5119017b8e8d6218fa0832b09df65aae7d5a6de29a85d813da2fb as builder ################################################################################ # Install devtoolset build dependencies diff --git a/tensorflow/workspace0.bzl b/tensorflow/workspace0.bzl index 144e34d7460806..005d8552b79300 100644 --- a/tensorflow/workspace0.bzl +++ b/tensorflow/workspace0.bzl @@ -8,6 +8,7 @@ load("@build_bazel_rules_swift//swift:repositories.bzl", "swift_rules_dependenci load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps") load("@local_config_android//:android.bzl", "android_workspace") load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies") +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") load("//third_party/googleapis:repository_rules.bzl", "config_googleapis") def _tf_bind(): @@ -79,14 +80,13 @@ def workspace(): # Note: We add this to fix Kokoro builds. # The rules below call into `rules_proto` but the hash has changed and # Bazel refuses to continue. So, we add our own mirror. 
- http_archive( + tf_http_archive( name = "rules_proto", sha256 = "20b240eba17a36be4b0b22635aca63053913d5c1ee36e16be36499d167a2f533", strip_prefix = "rules_proto-11bf7c25e666dd7ddacbcd4d4c4a9de7a25175f8", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_proto/archive/11bf7c25e666dd7ddacbcd4d4c4a9de7a25175f8.tar.gz", + urls = tf_mirror_urls( "https://github.com/bazelbuild/rules_proto/archive/11bf7c25e666dd7ddacbcd4d4c4a9de7a25175f8.tar.gz", - ], + ), ) # Now, finally use the rules @@ -106,13 +106,13 @@ def workspace(): # Toolchains for ML projects hermetic builds. # Details: https://github.com/google-ml-infra/rules_ml_toolchain - http_archive( + tf_http_archive( name = "rules_ml_toolchain", - sha256 = "b1e5e306d8b1103e73b9b778dfc3a9e069d20664437a03246a235724962b5c94", - strip_prefix = "rules_ml_toolchain-484235be45e6843db962c45d08fe4b2b65a6a24c", - urls = [ - "https://github.com/google-ml-infra/rules_ml_toolchain/archive/484235be45e6843db962c45d08fe4b2b65a6a24c.tar.gz", - ], + sha256 = "1c2c530a054e9e8b3c811ec21ed8a687fc865bec3abbc8ff65beb829b1d67ae4", + strip_prefix = "rules_ml_toolchain-6734d2a174bf29e731d3f473743d1cc1a86100c3", + urls = tf_mirror_urls( + "https://github.com/google-ml-infra/rules_ml_toolchain/archive/6734d2a174bf29e731d3f473743d1cc1a86100c3.tar.gz", + ), ) # Alias so it can be loaded without assigning to a different symbol to prevent diff --git a/tensorflow/workspace1.bzl b/tensorflow/workspace1.bzl index 399ff8f7579a7d..408e9a89183f0f 100644 --- a/tensorflow/workspace1.bzl +++ b/tensorflow/workspace1.bzl @@ -1,11 +1,11 @@ """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it.""" -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps") load("@com_google_benchmark//:bazel/benchmark_deps.bzl", "benchmark_deps") load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories") load("@local_xla//third_party/llvm:setup.bzl", "llvm_setup") load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies") +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") load("//third_party/android:android_configure.bzl", "android_configure") # buildifier: disable=unnamed-macro @@ -21,14 +21,13 @@ def workspace(with_rules_cc = True): closure_repositories() - http_archive( + tf_http_archive( name = "bazel_toolchains", sha256 = "294cdd859e57fcaf101d4301978c408c88683fbc46fbc1a3829da92afbea55fb", strip_prefix = "bazel-toolchains-8c717f8258cd5f6c7a45b97d974292755852b658", - urls = [ - "http://mirror.tensorflow.org/github.com/bazelbuild/bazel-toolchains/archive/8c717f8258cd5f6c7a45b97d974292755852b658.tar.gz", + urls = tf_mirror_urls( "https://github.com/bazelbuild/bazel-toolchains/archive/8c717f8258cd5f6c7a45b97d974292755852b658.tar.gz", - ], + ), ) android_configure(name = "local_config_android") diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl index 44725b23d6da6c..067de3cdcac811 100644 --- a/tensorflow/workspace2.bzl +++ b/tensorflow/workspace2.bzl @@ -1,6 +1,5 @@ """TensorFlow workspace initialization. 
Consult the WORKSPACE on how to use it.""" -load("//third_party:repo.bzl", "tf_vendored") load("@bazel_features//:deps.bzl", "bazel_features_deps") load("@bazel_skylib//lib:versions.bzl", "versions") load("@bazel_tools//tools/build_defs/repo:java.bzl", "java_import_external") @@ -169,18 +168,18 @@ def _tf_repositories(): # LINT.IfChange(xnnpack) tf_http_archive( name = "XNNPACK", - sha256 = "f644ad3ac88b3b0208a82742938bca35235865d6ca64950dac58b166877eb2a5", - strip_prefix = "XNNPACK-1b918df9d1744ae40725254f4baa592ed05c912e", - urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/1b918df9d1744ae40725254f4baa592ed05c912e.zip"), + sha256 = "08976c0ba6495775f78d738adbcc60a567b5826774f23d3c403486c70ff79772", + strip_prefix = "XNNPACK-183297df5c945236cbc4bb1f625f9f2008bfc564", + urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/183297df5c945236cbc4bb1f625f9f2008bfc564.zip"), ) # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake) # XNNPack dependency. tf_http_archive( name = "KleidiAI", - sha256 = "fb4f8180171d035a08432b086194121f627d00a76d58cebaad57d7a87ad40dbd", - strip_prefix = "kleidiai-7a3a609a3278106df7157bdd27b8f0e75ab00b60", - urls = tf_mirror_urls("https://github.com/ARM-software/kleidiai/archive/7a3a609a3278106df7157bdd27b8f0e75ab00b60.zip"), + sha256 = "5e922c9afb7a0c881fc4359b58488f3faa840e8435de1a2207a6525935ed83c2", + strip_prefix = "kleidiai-63205aa90afa6803d8f58bc3081b69288e9f1906", + urls = tf_mirror_urls("https://github.com/ARM-software/kleidiai/archive/63205aa90afa6803d8f58bc3081b69288e9f1906.zip"), ) tf_http_archive( @@ -410,16 +409,8 @@ def _tf_repositories(): }, ) - # Use XLA's googletest wrapper which provides EXPECT_OK and ASSERT_OK macros. - # This wrapper adds those macros to the open-source gmock/gmock.h header, - # matching the behavior of internal builds. - tf_vendored( - name = "com_google_googletest", - path = "third_party/xla/third_party/xla_googletest_wrapper", - ) - tf_http_archive( - name = "com_google_googletest_upstream", + name = "com_google_googletest", # Use the commit on 2025/6/09: # https://github.com/google/googletest/commit/28e9d1f26771c6517c3b4be10254887673c94018 sha256 = "f253ca1a07262f8efde8328e4b2c68979e40ddfcfc001f70d1d5f612c7de2974", @@ -428,6 +419,8 @@ def _tf_repositories(): # - avoid dependencies on @fuchsia_sdk, # - refer to re2 as @com_googlesource_code_re2, # - refer to abseil as @com_google_absl. + # - add status assert macros for consistency with internal gmock (see + # README.add-status-macros.md). # # To update the patch, run: # $ cd ~ @@ -440,7 +433,11 @@ def _tf_repositories(): # $ git diff > /third_party/tensorflow/third_party/googletest/googletest.patch # # The patch path is relative to third_party/tensorflow. - patch_file = ["@local_xla//third_party/googletest:googletest.patch"], + patch_file = [ + "@local_xla//third_party/googletest:googletest.patch", + "@local_xla//third_party/googletest:0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch", + "@local_xla//third_party/googletest:0002-Rename-dependencies-for-workspace.bzl-build.patch", + ], urls = tf_mirror_urls("https://github.com/google/googletest/archive/28e9d1f26771c6517c3b4be10254887673c940189.zip"), ) diff --git a/tensorflow/workspace3.bzl b/tensorflow/workspace3.bzl index adabcc54fc586d..b74e2e012b0e3f 100644 --- a/tensorflow/workspace3.bzl +++ b/tensorflow/workspace3.bzl @@ -1,80 +1,73 @@ """TensorFlow workspace initialization. 
Consult the WORKSPACE on how to use it.""" -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -load("//third_party:repo.bzl", "tf_vendored") +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls", "tf_vendored") load("//third_party/tf_runtime:workspace.bzl", tf_runtime = "repo") def workspace(): tf_vendored(name = "local_xla", path = "third_party/xla") tf_vendored(name = "local_tsl", path = "third_party/xla/third_party/tsl") - http_archive( + tf_http_archive( name = "io_bazel_rules_closure", sha256 = "5b00383d08dd71f28503736db0500b6fb4dda47489ff5fc6bed42557c07c6ba9", strip_prefix = "rules_closure-308b05b2419edb5c8ee0471b67a40403df940149", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz", + urls = tf_mirror_urls( "https://github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz", # 2019-06-13 - ], + ), ) tf_runtime() # https://github.com/bazelbuild/bazel-skylib/releases - http_archive( + tf_http_archive( name = "bazel_skylib", sha256 = "bc283cdfcd526a52c3201279cda4bc298652efa898b10b4db0837dc51652756f", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/bazel-skylib/releases/download/1.7.1/bazel-skylib-1.7.1.tar.gz", + urls = tf_mirror_urls( "https://github.com/bazelbuild/bazel-skylib/releases/download/1.7.1/bazel-skylib-1.7.1.tar.gz", - ], + ), ) - http_archive( + tf_http_archive( name = "rules_license", - urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/rules_license/releases/download/0.0.7/rules_license-0.0.7.tar.gz", + urls = tf_mirror_urls( "https://github.com/bazelbuild/rules_license/releases/download/0.0.7/rules_license-0.0.7.tar.gz", - ], + ), sha256 = "4531deccb913639c30e5c7512a054d5d875698daeb75d8cf90f284375fe7c360", ) - http_archive( + tf_http_archive( name = "rules_pkg", - urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/rules_pkg/releases/download/0.7.1/rules_pkg-0.7.1.tar.gz", + urls = tf_mirror_urls( "https://github.com/bazelbuild/rules_pkg/releases/download/0.7.1/rules_pkg-0.7.1.tar.gz", - ], + ), sha256 = "451e08a4d78988c06fa3f9306ec813b836b1d076d0f055595444ba4ff22b867f", ) - http_archive( + tf_http_archive( name = "bazel_features", sha256 = "4fd9922d464686820ffd8fcefa28ccffa147f7cdc6b6ac0d8b07fde565c65d66", strip_prefix = "bazel_features-1.25.0", - urls = [ - "https://mirror.bazel.build/github.com/bazel-contrib/bazel_features/releases/download/v1.25.0/bazel_features-v1.25.0.tar.gz", + urls = tf_mirror_urls( "https://github.com/bazel-contrib/bazel_features/releases/download/v1.25.0/bazel_features-v1.25.0.tar.gz", - ], + ), ) # Maven dependencies. 
RULES_JVM_EXTERNAL_TAG = "4.3" - http_archive( + tf_http_archive( name = "rules_jvm_external", strip_prefix = "rules_jvm_external-%s" % RULES_JVM_EXTERNAL_TAG, sha256 = "6274687f6fc5783b589f56a2f1ed60de3ce1f99bc4e8f9edef3de43bdf7c6e74", - url = "https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG, + urls = tf_mirror_urls("https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG), ) # Platforms - http_archive( + tf_http_archive( name = "platforms", sha256 = "29742e87275809b5e598dc2f04d86960cc7a55b3067d97221c9abbc9926bff0f", - urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.11/platforms-0.0.11.tar.gz", + urls = tf_mirror_urls( "https://github.com/bazelbuild/platforms/releases/download/0.0.11/platforms-0.0.11.tar.gz", - ], + ), ) # Alias so it can be loaded without assigning to a different symbol to prevent diff --git a/third_party/py/python_init_pip.bzl b/third_party/py/python_init_pip.bzl index 7689b92b60a00a..39901b9b2e64ea 100644 --- a/third_party/py/python_init_pip.bzl +++ b/third_party/py/python_init_pip.bzl @@ -24,6 +24,10 @@ cc_library( cc_library( name = "numpy_headers", deps = [":numpy_headers_2", ":numpy_headers_1"], + # For the layering check to work we need to re-export the headers from the + # dependencies. + hdrs = glob(["site-packages/numpy/_core/include/**/*.h"]) + + glob(["site-packages/numpy/core/include/**/*.h"]), ) """, ), diff --git a/third_party/py/python_init_rules.bzl b/third_party/py/python_init_rules.bzl index ac9b8eb3893441..e8bfd6548965e4 100644 --- a/third_party/py/python_init_rules.bzl +++ b/third_party/py/python_init_rules.bzl @@ -1,6 +1,5 @@ """Hermetic Python initialization. Consult the WORKSPACE on how to use it.""" -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") def python_init_rules(extra_patches = []): @@ -11,15 +10,14 @@ def python_init_rules(extra_patches = []): set of patches. 
""" - http_archive( + tf_http_archive( name = "rules_cc", - urls = ["https://github.com/bazelbuild/rules_cc/archive/refs/tags/0.1.0.tar.gz"], + urls = tf_mirror_urls("https://github.com/bazelbuild/rules_cc/archive/refs/tags/0.1.0.tar.gz"), strip_prefix = "rules_cc-0.1.0", sha256 = "4b12149a041ddfb8306a8fd0e904e39d673552ce82e4296e96fac9cbf0780e59", - patches = [ - Label("//third_party/py:rules_cc_protobuf.patch"), + patch_file = [ + "@local_xla//third_party/py:rules_cc_protobuf.patch", ], - patch_args = ["-p1"], ) tf_http_archive( @@ -34,15 +32,14 @@ def python_init_rules(extra_patches = []): }, ) - http_archive( + tf_http_archive( name = "rules_python", sha256 = "fa7dd2c6b7d63b3585028dd8a90a6cf9db83c33b250959c2ee7b583a6c130e12", strip_prefix = "rules_python-1.6.0", - url = "https://github.com/bazelbuild/rules_python/releases/download/1.6.0/rules_python-1.6.0.tar.gz", - patch_args = ["-p1"], - patches = [ - Label("//third_party/py:rules_python_pip_version.patch"), - Label("//third_party/py:rules_python_freethreaded.patch"), - Label("//third_party/py:rules_python_versions.patch"), + urls = tf_mirror_urls("https://github.com/bazelbuild/rules_python/releases/download/1.6.0/rules_python-1.6.0.tar.gz"), + patch_file = [ + "@local_xla//third_party/py:rules_python_pip_version.patch", + "@local_xla//third_party/py:rules_python_freethreaded.patch", + "@local_xla//third_party/py:rules_python_versions.patch", ] + extra_patches, ) diff --git a/third_party/py/python_init_toolchains.bzl b/third_party/py/python_init_toolchains.bzl index 860fc08ceda2a8..82d755c32bbfba 100644 --- a/third_party/py/python_init_toolchains.bzl +++ b/third_party/py/python_init_toolchains.bzl @@ -41,7 +41,6 @@ def python_init_toolchains(name = "python", python_version = None, **kwargs): tool_version = MINOR_MAPPING.get(HERMETIC_PYTHON_VERSION) if not tool_version: tool_version = HERMETIC_PYTHON_VERSION + ".0" - url_components = HERMETIC_PYTHON_URL.split("://", 1) sha256s = {} for platform in PLATFORMS.keys(): @@ -51,12 +50,12 @@ def python_init_toolchains(name = "python", python_version = None, **kwargs): python_register_toolchains( name = get_toolchain_name_per_python_version(name), - base_url = url_components[0] + "://", + base_url = "", ignore_root_user_error = True, python_version = tool_version, tool_versions = { tool_version: { - "url": url_components[1], + "url": HERMETIC_PYTHON_URL, "sha256": sha256s, "strip_prefix": HERMETIC_PYTHON_PREFIX, }, diff --git a/third_party/py/rules_python_versions.patch b/third_party/py/rules_python_versions.patch index 8dbc70bad193d7..c31b6772c2675f 100644 --- a/third_party/py/rules_python_versions.patch +++ b/third_party/py/rules_python_versions.patch @@ -1,8 +1,60 @@ diff --git a/python/versions.bzl b/python/versions.bzl -index 30929f82..8e79225a 100644 +index 30929f82..c0856d70 100644 --- a/python/versions.bzl +++ b/python/versions.bzl -@@ -855,6 +855,51 @@ TOOL_VERSIONS = { +@@ -810,6 +810,51 @@ TOOL_VERSIONS = { + "x86_64-unknown-linux-gnu-freethreaded": "python/install", + }, + }, ++ "3.13.11": { ++ "url": "20251209/cpython-{python_version}+20251209-{platform}-{build}.{ext}", ++ "sha256": { ++ "aarch64-apple-darwin": "295a9f7bc899ea1cc08baf60bbf511bdd1e4a29b2dd7e5f59b48f18bfa6bf585", ++ "aarch64-unknown-linux-gnu": "ea1e678e6e82301bb32bf3917732125949b6e46d541504465972024a3f165343", ++ "ppc64le-unknown-linux-gnu": "7660e53aad9d35ee256913c6d98427f81f078699962035c5fa8b5c3138695109", ++ "riscv64-unknown-linux-gnu": "763fa1548e6a432e9402916e690c74ea30f26dcd2e131893dd506f72b87c27c9", ++ 
"s390x-unknown-linux-gnu": "ffb6af51fbfabfc6fbc4e7379bdec70c2f51e972b1d2f45c053493b9da3a1bbe", ++ "x86_64-apple-darwin": "dac4a0a0a9b71f6b02a8b0886547fa22814474239bffb948e3e77185406ea136", ++ "x86_64-pc-windows-msvc": "87822417007045a28a7eccc47fe67b8c61265b99b10dbbfa24d231a3622b1c27", ++ "aarch64-pc-windows-msvc": "ba646d0c3b7dd7bdfb770d9b2ebd6cd2df02a37fda90c9c79a7cf59c7df6f165", ++ "aarch64-pc-windows-msvc-freethreaded": "6daf6d092c7294cfe68c4c7bf2698ac134235489c874b3bf796c7972b9dbba30", ++ "x86_64-unknown-linux-gnu": "1ffa06d714a44aea14c0c54c30656413e5955a6c92074b4b3cb4351dcc28b63b", ++ "x86_64-unknown-linux-musl": "969fe24017380b987c4e3ce15e9edf82a4618c1e61672b2cc9b021a1c98eae78", ++ "aarch64-apple-darwin-freethreaded": "4213058b7fcd875596c12b58cd46a399358b0a87ecde4b349cbdd00cf87ed79a", ++ "aarch64-unknown-linux-gnu-freethreaded": "290ca3bd0007db9e551f90b08dfcb6c1b2d62c33b2fc3e9a43e77d385d94f569", ++ "ppc64le-unknown-linux-gnu-freethreaded": "09d4b50f8abb443f7e3af858c920aa61c2430b0954df465e861caa7078e55e69", ++ "riscv64-unknown-linux-gnu-freethreaded": "5406f2a7cacafbd2aac3ce2de066a0929aab55423824276c36e04cb83babc36c", ++ "s390x-unknown-linux-gnu-freethreaded": "3984b67c4292892eaccdd1c094c7ec788884c4c9b3534ab6995f6be96d5ed51d", ++ "x86_64-apple-darwin-freethreaded": "d6f489464045d6895ae68b0a04a9e16477e74fe3185a75f3a9a0af8ccd25eade", ++ "x86_64-pc-windows-msvc-freethreaded": "bb9a29a7ba8f179273b79971da6aaa7be592d78c606a63f99eff3e4c12fb0fae", ++ "x86_64-unknown-linux-gnu-freethreaded": "33f89c957d986d525529b8a980103735776f4d20cf52f55960a057c760188ac3", ++ }, ++ "strip_prefix": { ++ "aarch64-apple-darwin": "python", ++ "aarch64-unknown-linux-gnu": "python", ++ "ppc64le-unknown-linux-gnu": "python", ++ "s390x-unknown-linux-gnu": "python", ++ "riscv64-unknown-linux-gnu": "python", ++ "x86_64-apple-darwin": "python", ++ "x86_64-pc-windows-msvc": "python", ++ "aarch64-pc-windows-msvc": "python", ++ "x86_64-unknown-linux-gnu": "python", ++ "x86_64-unknown-linux-musl": "python", ++ "aarch64-apple-darwin-freethreaded": "python/install", ++ "aarch64-unknown-linux-gnu-freethreaded": "python/install", ++ "ppc64le-unknown-linux-gnu-freethreaded": "python/install", ++ "riscv64-unknown-linux-gnu-freethreaded": "python/install", ++ "s390x-unknown-linux-gnu-freethreaded": "python/install", ++ "x86_64-apple-darwin-freethreaded": "python/install", ++ "x86_64-pc-windows-msvc-freethreaded": "python/install", ++ "aarch64-pc-windows-msvc-freethreaded": "python/install", ++ "x86_64-unknown-linux-gnu-freethreaded": "python/install", ++ }, ++ }, + "3.14.0rc1": { + "url": "20250808/cpython-{python_version}+20250808-{platform}-{build}.{ext}", + "sha256": { +@@ -855,6 +900,51 @@ TOOL_VERSIONS = { "x86_64-unknown-linux-gnu-freethreaded": "python/install", }, }, @@ -54,16 +106,18 @@ index 30929f82..8e79225a 100644 } # buildifier: disable=unsorted-dict-items -@@ -865,7 +910,7 @@ MINOR_MAPPING = { +@@ -864,8 +954,8 @@ MINOR_MAPPING = { + "3.10": "3.10.18", "3.11": "3.11.13", "3.12": "3.12.11", - "3.13": "3.13.6", +- "3.13": "3.13.6", - "3.14": "3.14.0rc1", ++ "3.13": "3.13.11", + "3.14": "3.14.0", } def _generate_platforms(): -@@ -1045,29 +1090,25 @@ def get_release_info(platform, python_version, base_url = DEFAULT_RELEASE_BASE_U +@@ -1045,29 +1135,25 @@ def get_release_info(platform, python_version, base_url = DEFAULT_RELEASE_BASE_U for u in url: p, _, _ = platform.partition(FREETHREADED) diff --git a/third_party/systemlibs/grpc.bazel.generate_cc.bzl b/third_party/systemlibs/grpc.bazel.generate_cc.bzl index 
aa5d18eaa9a488..f396b1f853e71c 100644 --- a/third_party/systemlibs/grpc.bazel.generate_cc.bzl +++ b/third_party/systemlibs/grpc.bazel.generate_cc.bzl @@ -46,7 +46,7 @@ def generate_cc_impl(ctx): includes = [ f for src in ctx.attr.srcs - for f in src[ProtoInfo].transitive_imports.to_list() + for f in src[ProtoInfo].transitive_sources.to_list() ] outs = [] proto_root = get_proto_root( diff --git a/third_party/systemlibs/grpc.bazel.protobuf.bzl b/third_party/systemlibs/grpc.bazel.protobuf.bzl index cfb124ce43b1ef..9eeb4cb4475188 100644 --- a/third_party/systemlibs/grpc.bazel.protobuf.bzl +++ b/third_party/systemlibs/grpc.bazel.protobuf.bzl @@ -163,7 +163,7 @@ def includes_from_deps(deps): return [ file for src in deps - for file in src[ProtoInfo].transitive_imports.to_list() + for file in src[ProtoInfo].transitive_sources.to_list() ] def get_proto_arguments(protos, genfiles_dir_path): diff --git a/third_party/xla/.github/workflows/bazel_dependency_violations.yml b/third_party/xla/.github/workflows/bazel_dependency_violations.yml index 0588447392e993..e3fbfbab9bee81 100644 --- a/third_party/xla/.github/workflows/bazel_dependency_violations.yml +++ b/third_party/xla/.github/workflows/bazel_dependency_violations.yml @@ -39,7 +39,7 @@ jobs: continue-on-error: true steps: - name: "Checking out repository" - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: "Install bazelisk" run: parallel --ungroup --retries 3 --delay 15 --nonall -- go install github.com/bazelbuild/bazelisk@24651ab # v1.20.0 - name: "Run bazel build --nobuild //xla/... with retries" diff --git a/third_party/xla/.github/workflows/bazel_query.yml b/third_party/xla/.github/workflows/bazel_query.yml index 052309ef806012..8888c7b0f3267e 100644 --- a/third_party/xla/.github/workflows/bazel_query.yml +++ b/third_party/xla/.github/workflows/bazel_query.yml @@ -34,7 +34,7 @@ jobs: timeout-minutes: 10 steps: - name: "Checking out repository" - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: "Install bazelisk" run: parallel --ungroup --retries 3 --delay 15 --nonall -- go install github.com/bazelbuild/bazelisk@24651ab # v1.20.0 - name: "Run bazel build --nobuild //xla/... with retries" diff --git a/third_party/xla/.github/workflows/bazel_tags.yml b/third_party/xla/.github/workflows/bazel_tags.yml index 00ed95c8e6f0e0..09ecd6f00603ef 100644 --- a/third_party/xla/.github/workflows/bazel_tags.yml +++ b/third_party/xla/.github/workflows/bazel_tags.yml @@ -34,7 +34,7 @@ jobs: timeout-minutes: 10 steps: - name: "Checking out repository" - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: "Install bazelisk" run: parallel --ungroup --retries 3 --delay 15 --nonall -- go install github.com/bazelbuild/bazelisk@24651ab # v1.20.0 - name: "Run bazel build --nobuild //xla/... 
with retries" diff --git a/third_party/xla/.github/workflows/benchmark_postsubmit.yml b/third_party/xla/.github/workflows/benchmark_postsubmit.yml index bab85cb699bd02..014f71761c2527 100644 --- a/third_party/xla/.github/workflows/benchmark_postsubmit.yml +++ b/third_party/xla/.github/workflows/benchmark_postsubmit.yml @@ -110,7 +110,7 @@ jobs: PR: ${{ steps.find_pr.outputs.pr }} - name: Checkout OpenXLA - uses: actions/checkout@v6.0.0 + uses: actions/checkout@v6.0.1 - name: Wait For Connection uses: google-ml-infra/actions/ci_connection@7f5ca0c263a81ed09ea276524c1b9192f1304e3c with: @@ -235,7 +235,7 @@ jobs: gsutil cp "$OUTPUT_FILE" "$GCS_BUCKET/$GCS_OBJECT_NAME" - name: Upload XSpace artifacts - uses: actions/upload-artifact@v5.0.0 + uses: actions/upload-artifact@v6.0.0 with: name: xspace-artifacts-${{ matrix.job_info.pool }}-${{ matrix.job_info.platform }} path: ${{ env.XSPACE_FILE }} \ No newline at end of file diff --git a/third_party/xla/.github/workflows/benchmark_presubmit.yml b/third_party/xla/.github/workflows/benchmark_presubmit.yml index 33f65f9eead53d..33dc31bd6a64d6 100644 --- a/third_party/xla/.github/workflows/benchmark_presubmit.yml +++ b/third_party/xla/.github/workflows/benchmark_presubmit.yml @@ -86,7 +86,7 @@ jobs: fi - name: Checkout OpenXLA - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: Configure GPU backend if: ${{ matrix.job_info.platform == 'GPU' }} diff --git a/third_party/xla/.github/workflows/buildifier.yml b/third_party/xla/.github/workflows/buildifier.yml index d61728b29b4716..079a608acc26e0 100644 --- a/third_party/xla/.github/workflows/buildifier.yml +++ b/third_party/xla/.github/workflows/buildifier.yml @@ -34,7 +34,7 @@ jobs: timeout-minutes: 6 steps: - name: "Checking out repository" - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: "Install buildifier" run: parallel --ungroup --retries 3 --delay 15 --nonall -- go install github.com/bazelbuild/buildtools/buildifier@433ea85 # 6.4.0 - name: "Run buildifier" diff --git a/third_party/xla/.github/workflows/check_contents.yml b/third_party/xla/.github/workflows/check_contents.yml index 820a99675525ca..afc6f9c7780e14 100644 --- a/third_party/xla/.github/workflows/check_contents.yml +++ b/third_party/xla/.github/workflows/check_contents.yml @@ -46,7 +46,7 @@ jobs: contains(github.event.pull_request.body, 'FORCE_TEST_ACTIONS') steps: - name: "Checking out repository" - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: "Fetch HEAD of main branch" run: git fetch origin main --depth=1 diff --git a/third_party/xla/.github/workflows/ci.yml b/third_party/xla/.github/workflows/ci.yml index daafda6979df76..db5629dac494b0 100644 --- a/third_party/xla/.github/workflows/ci.yml +++ b/third_party/xla/.github/workflows/ci.yml @@ -115,15 +115,14 @@ jobs: defaults: run: shell: bash - timeout-minutes: 60 steps: - name: "Checking out openxla/xla" - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: path: "openxla/xla" - name: Checking out ${{ matrix.job_info.repo }} if: ${{ matrix.job_info.repo != 'openxla/xla' }} - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: 
actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: repository: ${{ matrix.job_info.repo }} path: ${{ matrix.job_info.repo }} @@ -133,6 +132,7 @@ jobs: halt-dispatch-input: ${{ inputs.halt-for-connection }} - name: "Run build.py" working-directory: ${{ matrix.job_info.repo }} + timeout-minutes: 60 run: | if [[ "${{ matrix.job_info.pool }}" == *windows* ]]; then python $GITHUB_WORKSPACE\\openxla\\xla\\build_tools\\ci\\build.py --build="${{ matrix.job_info.name }}_github_actions" diff --git a/third_party/xla/.github/workflows/ci_multi_device.yml b/third_party/xla/.github/workflows/ci_multi_device.yml new file mode 100644 index 00000000000000..4171626436f600 --- /dev/null +++ b/third_party/xla/.github/workflows/ci_multi_device.yml @@ -0,0 +1,64 @@ +# Copyright 2025 The OpenXLA Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +name: Multi-Device CI +permissions: + contents: read +on: + workflow_dispatch: # Allows manual triggering + +jobs: + Tests: + strategy: + # Don't fail fast - want to see results for all builds even if one fails. + fail-fast: false + matrix: + job_info: [ + { + pool: "linux-x86-a3-8g-h100-8gpu", + container: "us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/ml-build:latest", + name: "XLA Linux x86 GPU 8xH100", + repo: "openxla/xla", + }, + ] + name: ${{ matrix.job_info.name }} + runs-on: ${{ matrix.job_info.pool }} + container: ${{ matrix.job_info.container }} + defaults: + run: + shell: bash + timeout-minutes: 60 + steps: + - name: "Checking out openxla/xla" + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + with: + path: "openxla/xla" + - name: Checking out ${{ matrix.job_info.repo }} + if: ${{ matrix.job_info.repo != 'openxla/xla' }} + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + with: + repository: ${{ matrix.job_info.repo }} + path: ${{ matrix.job_info.repo }} + - name: "Wait For Connection" + uses: google-ml-infra/actions/ci_connection@7f5ca0c263a81ed09ea276524c1b9192f1304e3c + with: + halt-dispatch-input: ${{ inputs.halt-for-connection }} + - name: "Run build.py" + working-directory: ${{ matrix.job_info.repo }} + run: | + if [[ "${{ matrix.job_info.pool }}" == *windows* ]]; then + python $GITHUB_WORKSPACE\\openxla\\xla\\build_tools\\ci\\build.py --build="${{ matrix.job_info.name }}_github_actions" + else + $GITHUB_WORKSPACE/openxla/xla/build_tools/ci/build.py --build="${{ matrix.job_info.name }}_github_actions" + fi diff --git a/third_party/xla/.github/workflows/clang_format.yml b/third_party/xla/.github/workflows/clang_format.yml index 198d0dd5df3a83..f0de7043ebb15b 100644 --- a/third_party/xla/.github/workflows/clang_format.yml +++ b/third_party/xla/.github/workflows/clang_format.yml @@ -34,7 +34,7 @@ jobs: contains(github.event.pull_request.body, 'FORCE_TEST_ACTIONS') steps: - name: "Checking out repository" - uses: 
actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: fetch-depth: '0' - name: "Fetch HEAD of main branch" diff --git a/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml b/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml index 39225d1aeb05d9..060249354d15d9 100644 --- a/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml +++ b/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml @@ -75,7 +75,7 @@ jobs: fi - name: Checkout OpenXLA - uses: actions/checkout@v6.0.0 + uses: actions/checkout@v6.0.1 - name: Create results directory run: @@ -207,7 +207,7 @@ jobs: gsutil cp "$OUTPUT_DIR/$FILENAME_GEMMA3" "$GCS_BUCKET/$GEMMA3_GCS_OBJECT_NAME" - name: Upload XSpace artifacts - uses: actions/upload-artifact@v5.0.0 + uses: actions/upload-artifact@v6.0.0 with: name: cpu-xla-benchmarks-xspace-${{ matrix.job_info.pool }} path: ${{ github.workspace }}/${{ matrix.job_info.output_dir }}/*_xspace.pb diff --git a/third_party/xla/.github/workflows/generate_benchmark_matrix.yml b/third_party/xla/.github/workflows/generate_benchmark_matrix.yml index 51f97449ee6b41..e96e3d44ecaab1 100644 --- a/third_party/xla/.github/workflows/generate_benchmark_matrix.yml +++ b/third_party/xla/.github/workflows/generate_benchmark_matrix.yml @@ -54,7 +54,7 @@ jobs: steps: - name: Checkout OpenXLA - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: # Use inputs.checkout_ref if provided, otherwise default to the event's ref # (e.g., PR's HEAD SHA or caller's commit SHA) diff --git a/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml b/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml index 748226655d8b9d..8ab4fb1d5dba14 100644 --- a/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml +++ b/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml @@ -56,7 +56,7 @@ jobs: OUTPUT_DIR: ${{ github.workspace }}/output steps: - name: Checkout XLA - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: Download Gemma Hlo Files run: | @@ -198,7 +198,7 @@ jobs: upload_to_gcs "$GEMMA3_SAMPLE_LOOP_BASE_NAME" - name: Upload XSpace artifacts - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: gpu-xla-benchmarks-xspace-${{ matrix.job_info.os }} path: ${{ github.workspace }}/output/*_xspace.pb diff --git a/third_party/xla/.github/workflows/nightly_benchmarks.yml b/third_party/xla/.github/workflows/nightly_benchmarks.yml index 23a82d9350624d..e65fd69daf6944 100644 --- a/third_party/xla/.github/workflows/nightly_benchmarks.yml +++ b/third_party/xla/.github/workflows/nightly_benchmarks.yml @@ -110,7 +110,7 @@ jobs: exit 1 fi - name: Checkout OpenXLA Repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: ref: ${{ env.CHECKOUT_REF }} - name: Build Binaries @@ -182,7 +182,7 @@ jobs: gsutil cp "$RESULTS_JSON_FILE_PATH" "$GCS_BUCKET/$GCS_OBJECT_NAME" - name: Upload Benchmark Artifacts if: always() - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 
with: name: results-${{ env.CONFIG_ID }} path: ${{ env.RESOLVED_OUTPUT_DIR }} diff --git a/third_party/xla/.github/workflows/postsubmit_benchmark.yml b/third_party/xla/.github/workflows/postsubmit_benchmark.yml index 346c17bfaec5c4..a4f249366b6ea9 100644 --- a/third_party/xla/.github/workflows/postsubmit_benchmark.yml +++ b/third_party/xla/.github/workflows/postsubmit_benchmark.yml @@ -145,7 +145,7 @@ jobs: fi - name: Checkout OpenXLA Repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: ref: ${{ env.CHECKOUT_REF }} @@ -224,7 +224,7 @@ jobs: - name: Upload Benchmark Artifacts if: always() - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: results-${{ env.CONFIG_ID }} path: ${{ env.RESOLVED_OUTPUT_DIR }} diff --git a/third_party/xla/.github/workflows/presubmit_benchmark.yml b/third_party/xla/.github/workflows/presubmit_benchmark.yml index e3efa8d429bf5b..4259667c73dad0 100644 --- a/third_party/xla/.github/workflows/presubmit_benchmark.yml +++ b/third_party/xla/.github/workflows/presubmit_benchmark.yml @@ -139,7 +139,7 @@ jobs: fi - name: Checkout OpenXLA Repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: ref: ${{ env.CHECKOUT_REF }} @@ -199,7 +199,7 @@ jobs: - name: Upload Benchmark Artifacts if: always() - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: results-${{ env.CONFIG_ID }} path: ${{ env.RESOLVED_OUTPUT_DIR }} diff --git a/third_party/xla/.github/workflows/rollback_notification.yml b/third_party/xla/.github/workflows/rollback_notification.yml index 8978fe8d9984e5..7a3c21fdd21b9f 100644 --- a/third_party/xla/.github/workflows/rollback_notification.yml +++ b/third_party/xla/.github/workflows/rollback_notification.yml @@ -33,7 +33,7 @@ jobs: timeout-minutes: 6 steps: - name: "Checking out repository" - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: "Check if PR was rolled back" run: python3 .github/workflows/rollback_notification.py diff --git a/third_party/xla/.github/workflows/scorecards-analysis.yml b/third_party/xla/.github/workflows/scorecards-analysis.yml index f781a8bcb93b8a..0ec69c216d7aaf 100644 --- a/third_party/xla/.github/workflows/scorecards-analysis.yml +++ b/third_party/xla/.github/workflows/scorecards-analysis.yml @@ -44,7 +44,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: persist-credentials: false @@ -58,7 +58,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v3.pre.node20 + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v3.pre.node20 with: name: SARIF file path: results.sarif @@ -67,6 +67,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard (optional). 
# Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@fe4161a26a8629af62121b670040955b330f9af2 # v4.31.6 + uses: github/codeql-action/upload-sarif@1b168cd39490f61582a9beae412bb7057a6b2c4e # v4.31.8 with: sarif_file: results.sarif diff --git a/third_party/xla/.gitignore b/third_party/xla/.gitignore index 619ec239a7809c..734c302636dbb4 100644 --- a/third_party/xla/.gitignore +++ b/third_party/xla/.gitignore @@ -28,3 +28,8 @@ tools/python_bin_path.sh *.VC.opendb *.suo *.user + +# Ignore clangd files and directories: https://openxla.org/xla/lsp +.cache +compile_commands.json +external diff --git a/third_party/xla/MODULE.bazel b/third_party/xla/MODULE.bazel index d54ae9978b494d..114f1dd36f5315 100644 --- a/third_party/xla/MODULE.bazel +++ b/third_party/xla/MODULE.bazel @@ -3,6 +3,7 @@ module(name = "xla") ############################################################## # Bazel module dependencies +# go/keep-sorted start bazel_dep(name = "abseil-cpp", version = "20250814.0", repo_name = "com_google_absl") bazel_dep(name = "abseil-py", version = "2.1.0", repo_name = "absl_py") bazel_dep(name = "bazel_features", version = "1.36.0") @@ -10,8 +11,7 @@ bazel_dep(name = "bazel_skylib", version = "1.8.1") bazel_dep(name = "boringssl", version = "0.20250818.0") bazel_dep(name = "curl", version = "8.11.0") bazel_dep(name = "google_benchmark", version = "1.8.5", repo_name = "com_google_benchmark") -bazel_dep(name = "googletest", version = "1.17.0", repo_name = "com_google_googletest_upstream") -bazel_dep(name = "xla_googletest_wrapper", version = "1.0", repo_name = "com_google_googletest") +bazel_dep(name = "googletest", version = "1.17.0", repo_name = "com_google_googletest") bazel_dep(name = "grpc", version = "1.74.1", repo_name = "com_github_grpc_grpc") bazel_dep(name = "gutil", version = "20250502.0", repo_name = "com_google_gutil") bazel_dep(name = "jsoncpp", version = "1.9.6", repo_name = "jsoncpp_git") @@ -22,12 +22,15 @@ bazel_dep(name = "pybind11_abseil", version = "202402.0") bazel_dep(name = "pybind11_bazel", version = "2.13.6") bazel_dep(name = "pybind11_protobuf", version = "0.0.0-20250210-f02a2b7") bazel_dep(name = "re2", version = "2024-07-02.bcr.1", repo_name = "com_googlesource_code_re2") +bazel_dep(name = "riegeli", version = "0.0.0-20250822-9f2744d", repo_name = "com_google_riegeli") bazel_dep(name = "rules_cc", version = "0.2.0") +bazel_dep(name = "rules_java", version = "8.16.1") bazel_dep(name = "rules_license", version = "1.0.0") bazel_dep(name = "rules_python", version = "1.6.0") bazel_dep(name = "rules_shell", version = "0.6.1") bazel_dep(name = "snappy", version = "1.2.1") bazel_dep(name = "zlib", version = "1.3.1.bcr.5") +# go/keep-sorted end # Only for compatibility, not directly used, change repo_name to None after upgrading Bazel to latest 7.x bazel_dep(name = "eigen", version = "4.0.0-20241125.bcr.3", repo_name = "DO_NOT_USE_eigen") @@ -42,9 +45,9 @@ bazel_dep(name = "rules_ml_toolchain") # echo "sha256-${HASH}" archive_override( module_name = "rules_ml_toolchain", - integrity = "sha256-seXjBtixED5zubd438Op4GnSBmRDegMkaiNXJJYrXJQ=", - strip_prefix = "rules_ml_toolchain-484235be45e6843db962c45d08fe4b2b65a6a24c", - urls = ["https://github.com/google-ml-infra/rules_ml_toolchain/archive/484235be45e6843db962c45d08fe4b2b65a6a24c.tar.gz"], + integrity = "sha256-HCxTCgVOnos8gR7CHtimh/yGW+w6u8j/Zb64KbHWeuQ=", + strip_prefix = 
"rules_ml_toolchain-6734d2a174bf29e731d3f473743d1cc1a86100c3", + urls = ["https://github.com/google-ml-infra/rules_ml_toolchain/archive/6734d2a174bf29e731d3f473743d1cc1a86100c3.tar.gz"], ) # TODO: Upstream the patch? @@ -71,15 +74,14 @@ single_version_override( # Use an unreleased version of googletest archive_override( module_name = "googletest", + patch_strip = 1, + patches = [ + "//third_party/googletest:0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch", + ], strip_prefix = "googletest-28e9d1f26771c6517c3b4be10254887673c94018", urls = ["https://github.com/google/googletest/archive/28e9d1f26771c6517c3b4be10254887673c94018.zip"], ) -local_path_override( - module_name = "xla_googletest_wrapper", - path = "third_party/xla_googletest_wrapper", -) - ############################################################## # C++ dependencies diff --git a/third_party/xla/WORKSPACE b/third_party/xla/WORKSPACE index 69ebc22643da3b..29e65b3afcc430 100644 --- a/third_party/xla/WORKSPACE +++ b/third_party/xla/WORKSPACE @@ -1,19 +1,19 @@ # buildifier: disable=load-on-top workspace(name = "xla") -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") # Initialize toolchains for ML projects. # # A hermetic build system is designed to produce completely reproducible builds for C++. # Details: https://github.com/google-ml-infra/rules_ml_toolchain -http_archive( +tf_http_archive( name = "rules_ml_toolchain", - sha256 = "7f00b3e94bbca1a4737ded6b9ed5358f6d1c86430c2ec97c90081343c0482f18", - strip_prefix = "rules_ml_toolchain-29d54c875da37e74b8548924ed30e78cb28126b9", - urls = [ - "https://github.com/google-ml-infra/rules_ml_toolchain/archive/29d54c875da37e74b8548924ed30e78cb28126b9.tar.gz", - ], + sha256 = "1c2c530a054e9e8b3c811ec21ed8a687fc865bec3abbc8ff65beb829b1d67ae4", + strip_prefix = "rules_ml_toolchain-6734d2a174bf29e731d3f473743d1cc1a86100c3", + urls = tf_mirror_urls( + "https://github.com/google-ml-infra/rules_ml_toolchain/archive/6734d2a174bf29e731d3f473743d1cc1a86100c3.tar.gz", + ), ) load( diff --git a/third_party/xla/build_tools/ci/BUILD b/third_party/xla/build_tools/ci/BUILD index 3d37ca202dd82b..79a571ba22ace1 100644 --- a/third_party/xla/build_tools/ci/BUILD +++ b/third_party/xla/build_tools/ci/BUILD @@ -14,6 +14,7 @@ # ============================================================================ load("@bazel_skylib//rules:diff_test.bzl", "diff_test") +load("@rules_shell//shell:sh_binary.bzl", "sh_binary") load("//xla:pytype.bzl", "pytype_strict_binary") package( diff --git a/third_party/xla/build_tools/ci/build.py b/third_party/xla/build_tools/ci/build.py index 20d77da3bee540..8a6efe40f3a71f 100755 --- a/third_party/xla/build_tools/ci/build.py +++ b/third_party/xla/build_tools/ci/build.py @@ -111,6 +111,7 @@ class BuildType(enum.Enum): XLA_LINUX_X86_CPU_BZLMOD_GITHUB_ACTIONS = enum.auto() XLA_LINUX_ARM64_CPU_GITHUB_ACTIONS = enum.auto() XLA_LINUX_X86_GPU_L4_GITHUB_ACTIONS = enum.auto() + XLA_LINUX_X86_GPU_8X_H100_GITHUB_ACTIONS = enum.auto() XLA_LINUX_X86_GPU_ONEAPI_GITHUB_ACTIONS = enum.auto() # Presubmit builds for regression testing. 
@@ -273,33 +274,37 @@ def _tag_filters_for_compute_capability( return tag_filters +nvidia_gpu_filters = ( + "-no_oss", + "requires-gpu-nvidia", + "gpu", + "-rocm-only", + "-oneapi-only", +) + +nvidia_single_gpu_filters = nvidia_gpu_filters + ("-multi_gpu",) + +nvidia_only_multi_gpu_filters = nvidia_gpu_filters + ("multi_gpu",) + + def nvidia_gpu_build_with_compute_capability( *, type_: BuildType, configs: Tuple[str, ...], compute_capability: int, + multi_gpu: bool = False, ) -> Build: extra_gpu_tags = _tag_filters_for_compute_capability(compute_capability) + filter_tags = ( + nvidia_only_multi_gpu_filters if multi_gpu else nvidia_single_gpu_filters + ) return Build( type_=type_, repo="openxla/xla", target_patterns=_XLA_DEFAULT_TARGET_PATTERNS, configs=configs, - test_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ) - + extra_gpu_tags, - build_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ), + test_tag_filters=filter_tags + extra_gpu_tags, + build_tag_filters=filter_tags, options={ "run_under": "//build_tools/ci:parallel_gpu_execute", "//xla/tsl:ci_build": True, @@ -436,6 +441,14 @@ def nvidia_gpu_build_with_compute_capability( type_=BuildType.XLA_LINUX_X86_GPU_L4_GITHUB_ACTIONS, configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"), compute_capability=75, + multi_gpu=False, +) + +nvidia_gpu_build_with_compute_capability( + type_=BuildType.XLA_LINUX_X86_GPU_8X_H100_GITHUB_ACTIONS, + configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"), + compute_capability=90, + multi_gpu=True, ) oneapi_build_tag_filter = ( @@ -510,21 +523,9 @@ def nvidia_gpu_build_with_compute_capability( repo="openxla/xla", target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS, configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"), - test_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ) + test_tag_filters=nvidia_single_gpu_filters + _tag_filters_for_compute_capability(compute_capability=75), - build_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ), + build_tag_filters=nvidia_single_gpu_filters, options={ "run_under": "//build_tools/ci:parallel_gpu_execute", "//xla/tsl:ci_build": True, @@ -542,21 +543,9 @@ def nvidia_gpu_build_with_compute_capability( repo="openxla/xla", target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS, configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"), - test_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ) + test_tag_filters=nvidia_single_gpu_filters + _tag_filters_for_compute_capability(compute_capability=75), - build_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ), + build_tag_filters=nvidia_single_gpu_filters, options={ "run_under": "//build_tools/ci:parallel_gpu_execute", "//xla/tsl:ci_build": True, @@ -575,21 +564,9 @@ def nvidia_gpu_build_with_compute_capability( repo="openxla/xla", configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"), target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS, - test_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ) + test_tag_filters=nvidia_single_gpu_filters + _tag_filters_for_compute_capability(compute_capability=75), - build_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - 
"-oneapi-only", - ), + build_tag_filters=nvidia_single_gpu_filters, options={ "run_under": "//build_tools/ci:parallel_gpu_execute", "//xla/tsl:ci_build": True, @@ -607,21 +584,9 @@ def nvidia_gpu_build_with_compute_capability( repo="openxla/xla", configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"), target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS, - test_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ) + test_tag_filters=nvidia_single_gpu_filters + _tag_filters_for_compute_capability(compute_capability=75), - build_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ), + build_tag_filters=nvidia_single_gpu_filters, options={ "run_under": "//build_tools/ci:parallel_gpu_execute", "//xla/tsl:ci_build": True, @@ -640,21 +605,9 @@ def nvidia_gpu_build_with_compute_capability( repo="openxla/xla", configs=(), target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS, - test_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ) + test_tag_filters=nvidia_single_gpu_filters + _tag_filters_for_compute_capability(compute_capability=100), - build_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ), + build_tag_filters=nvidia_single_gpu_filters, options={ "run_under": "//build_tools/ci:parallel_gpu_execute", # Use User Mode and Kernel Mode Drivers pre-installed on the system. @@ -675,21 +628,9 @@ def nvidia_gpu_build_with_compute_capability( repo="openxla/xla", configs=(), target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS, - test_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ) + test_tag_filters=nvidia_single_gpu_filters + _tag_filters_for_compute_capability(compute_capability=100), - build_tag_filters=( - "-no_oss", - "requires-gpu-nvidia", - "gpu", - "-rocm-only", - "-oneapi-only", - ), + build_tag_filters=nvidia_single_gpu_filters, options={ "run_under": "//build_tools/ci:parallel_gpu_execute", # Use User Mode and Kernel Mode Drivers pre-installed on the system. 
@@ -731,7 +672,6 @@ def nvidia_gpu_build_with_compute_capability( **_DEFAULT_BAZEL_OPTIONS, "macos_minimum_os": "10.15", "test_tmpdir": "/Volumes/BuildData/bazel_output", - "define": "xnn_enable_avxvnniint8=false", "//xla/tsl:ci_build": True, }, build_tag_filters=macos_tag_filter, @@ -767,7 +707,6 @@ def nvidia_gpu_build_with_compute_capability( "macos_minimum_os": "10.15", "test_tmpdir": "/tmpfs/bazel_output", "test_size_filters": "small,medium", - "define": "xnn_enable_avxvnniint8=false", "//xla/tsl:ci_build": True, }, build_tag_filters=macos_tag_filter, @@ -932,11 +871,7 @@ def nvidia_gpu_build_with_compute_capability( Build( type_=BuildType.TENSORFLOW_LINUX_X86_GPU_L4_GITHUB_ACTIONS, repo="tensorflow/tensorflow", - configs=( - "release_gpu_linux", - "rbe_linux_cuda", - "hermetic_cuda_umd" - ), + configs=("release_gpu_linux", "rbe_linux_cuda", "hermetic_cuda_umd"), target_patterns=( "//tensorflow/compiler/...", "-//tensorflow/compiler/tf2tensorrt/...", diff --git a/third_party/xla/build_tools/ci/golden_commands.txt b/third_party/xla/build_tools/ci/golden_commands.txt index f5e914157ec888..feccb0fc7cdd78 100644 --- a/third_party/xla/build_tools/ci/golden_commands.txt +++ b/third_party/xla/build_tools/ci/golden_commands.txt @@ -53,46 +53,52 @@ parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_fi bazel test --build_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=nonccl --config=rbe_linux_cpu --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --//xla/tsl:ci_build -- //xla/... //build_tools/... @local_tsl//tsl/... bazel analyze-profile profile.json.gz # END BuildType.XLA_LINUX_X86_CPU_GITHUB_ACTIONS +# BEGIN BuildType.XLA_LINUX_X86_GPU_8X_H100_GITHUB_ACTIONS +nvidia-smi +parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,multi_gpu,requires-gpu-sm90-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=9.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @local_tsl//tsl/... 
+bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,multi_gpu,requires-gpu-sm90-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=9.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @local_tsl//tsl/... +bazel analyze-profile profile.json.gz +# END BuildType.XLA_LINUX_X86_GPU_8X_H100_GITHUB_ACTIONS # BEGIN BuildType.XLA_LINUX_X86_GPU_A4_224_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS nvidia-smi -parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu -bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- 
//xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu bazel analyze-profile profile.json.gz # END BuildType.XLA_LINUX_X86_GPU_A4_224_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS # BEGIN BuildType.XLA_LINUX_X86_GPU_A4_224_VCPU_PRESUBMIT_GITHUB_ACTIONS nvidia-smi -parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu -bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 
--bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu bazel analyze-profile profile.json.gz # END BuildType.XLA_LINUX_X86_GPU_A4_224_VCPU_PRESUBMIT_GITHUB_ACTIONS # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_16_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS nvidia-smi -parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu -bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc 
--config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu bazel analyze-profile profile.json.gz # END BuildType.XLA_LINUX_X86_GPU_L4_16_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_16_VCPU_PRESUBMIT_GITHUB_ACTIONS nvidia-smi -parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu -bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu 
--test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu bazel analyze-profile profile.json.gz # END BuildType.XLA_LINUX_X86_GPU_L4_16_VCPU_PRESUBMIT_GITHUB_ACTIONS # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_48_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS nvidia-smi -parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu -bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going 
--nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu bazel analyze-profile profile.json.gz # END BuildType.XLA_LINUX_X86_GPU_L4_48_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_48_VCPU_PRESUBMIT_GITHUB_ACTIONS nvidia-smi -parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu -bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only 
--test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu +bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu bazel analyze-profile profile.json.gz # END BuildType.XLA_LINUX_X86_GPU_L4_48_VCPU_PRESUBMIT_GITHUB_ACTIONS # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_GITHUB_ACTIONS nvidia-smi -parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 
--bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @local_tsl//tsl/... -bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @local_tsl//tsl/... +parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @local_tsl//tsl/... +bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @local_tsl//tsl/... bazel analyze-profile profile.json.gz # END BuildType.XLA_LINUX_X86_GPU_L4_GITHUB_ACTIONS # BEGIN BuildType.XLA_LINUX_X86_GPU_ONEAPI_GITHUB_ACTIONS @@ -104,7 +110,7 @@ bazel analyze-profile profile.json.gz df -h bazel --version mkdir -p /tmpfs/bazel_output -bazel test --build_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=nonccl --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --macos_minimum_os=10.15 --test_tmpdir=/tmpfs/bazel_output --test_size_filters=small,medium --define=xnn_enable_avxvnniint8=false --//xla/tsl:ci_build -- //xla/... -//xla/hlo/experimental/... -//xla/python_api/... -//xla/python/... 
-//xla/service/gpu/... +bazel test --build_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=nonccl --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --macos_minimum_os=10.15 --test_tmpdir=/tmpfs/bazel_output --test_size_filters=small,medium --//xla/tsl:ci_build -- //xla/... -//xla/hlo/experimental/... -//xla/python_api/... -//xla/python/... -//xla/service/gpu/... bazel analyze-profile profile.json.gz # END BuildType.XLA_MACOS_ARM64_CPU_KOKORO # BEGIN BuildType.XLA_MACOS_X86_CPU_KOKORO @@ -112,7 +118,7 @@ sudo wget --no-verbose -O /usr/local/bin/bazel https://github.com/bazelbuild/baz chmod +x /usr/local/bin/bazel bazel --version mkdir -p /Volumes/BuildData/bazel_output -bazel test --build_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=nonccl --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --macos_minimum_os=10.15 --test_tmpdir=/Volumes/BuildData/bazel_output --define=xnn_enable_avxvnniint8=false --//xla/tsl:ci_build -- //xla/... -//xla/hlo/experimental/... -//xla/python_api/... -//xla/python/... -//xla/service/gpu/... +bazel test --build_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=nonccl --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --macos_minimum_os=10.15 --test_tmpdir=/Volumes/BuildData/bazel_output --//xla/tsl:ci_build -- //xla/... -//xla/hlo/experimental/... -//xla/python_api/... -//xla/python/... -//xla/service/gpu/... bazel analyze-profile profile.json.gz # END BuildType.XLA_MACOS_X86_CPU_KOKORO # BEGIN BuildType.XLA_WINDOWS_X86_CPU_GITHUB_ACTIONS diff --git a/third_party/xla/build_tools/configure/configure.py b/third_party/xla/build_tools/configure/configure.py index 30729ca1031561..a54852db554b35 100755 --- a/third_party/xla/build_tools/configure/configure.py +++ b/third_party/xla/build_tools/configure/configure.py @@ -452,18 +452,6 @@ def to_bazelrc_lines( if dpav.clang_major_version and dpav.clang_major_version >= 19: self.compiler_options.append("-Wno-c23-extensions") - # Avoid XNNPACK using `-mavxvnniint8` (needs clang-16+/gcc-13+) - if ( - dpav.clang_major_version is not None and dpav.clang_major_version < 16 - ) or (dpav.gcc_major_version is not None and dpav.gcc_major_version < 13): - rc.append("build --define=xnn_enable_avxvnniint8=false") - - # Avoid XNNPACK using `-mavx512fp16` (needs clang-14+/gcc-12+). 
- if ( - dpav.clang_major_version is not None and dpav.clang_major_version < 14 - ) or (dpav.gcc_major_version is not None and dpav.gcc_major_version < 12): - rc.append("build --define=xnn_enable_avx512fp16=false") - rc.append(f"build --action_env PYTHON_BIN_PATH={self.python_bin_path}") rc.append(f"build --python_path {self.python_bin_path}") rc.append("test --test_env LD_LIBRARY_PATH") diff --git a/third_party/xla/build_tools/configure/testdata/gcc.bazelrc b/third_party/xla/build_tools/configure/testdata/gcc.bazelrc index 54545cbb9914bc..8eefec15ee8efb 100644 --- a/third_party/xla/build_tools/configure/testdata/gcc.bazelrc +++ b/third_party/xla/build_tools/configure/testdata/gcc.bazelrc @@ -1,6 +1,4 @@ build --action_env GCC_HOST_COMPILER_PATH=/usr/bin/gcc -build --define=xnn_enable_avxvnniint8=false -build --define=xnn_enable_avx512fp16=false build --action_env PYTHON_BIN_PATH=/usr/bin/python3 build --python_path /usr/bin/python3 test --test_env LD_LIBRARY_PATH diff --git a/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc b/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc index 3155b30218df08..373613415c1f7c 100644 --- a/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc +++ b/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc @@ -5,8 +5,6 @@ build:cuda --repo_env HERMETIC_CUDA_COMPUTE_CAPABILITIES=7.5 build:cuda --repo_env HERMETIC_CUDNN_VERSION="9.8.0" build --config nonccl build --action_env LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 -build --define=xnn_enable_avxvnniint8=false -build --define=xnn_enable_avx512fp16=false build --action_env PYTHON_BIN_PATH=/usr/bin/python3 build --python_path /usr/bin/python3 test --test_env LD_LIBRARY_PATH diff --git a/third_party/xla/build_tools/dependencies/gen_disable_layering_check_patch.sh b/third_party/xla/build_tools/dependencies/gen_disable_layering_check_patch.sh new file mode 100755 index 00000000000000..f1d9f7d670eaa9 --- /dev/null +++ b/third_party/xla/build_tools/dependencies/gen_disable_layering_check_patch.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# Copyright 2025 The OpenXLA Authors. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Generates a patch file that disables the layering check for all cc_library +# targets in the archive. Both BUILD and BUILD.bazel files are taken into account. +# +# The script takes one argument: the URL of the .tar.gz archive to download. +# +# The following tools are needed (need to be installed on the machine): +# - curl +# - git +# - buildozer (from Bazel buildtools) +# +# The tool has originally been written for ortools but should work for similarly structured +# projects as well. +# +# Example: +# build_tools/dependencies/gen_disable_layering_check_patch.sh \ +# https://github.com/google/or-tools/archive/v9.11.tar.gz \ +# > third_party/ortools/layering_check.patch + +set -euo pipefail + +readonly TMP_DIR=$(mktemp -d) +trap 'rm -rf -- $TMP_DIR' EXIT + +echo "Downloading archive $1..." 
>&2 +curl -Lqo "$TMP_DIR/archive.tar.gz" "$1" 1>&2 + +echo "Extracting archive..." >&2 +mkdir -p "$TMP_DIR/extracted" 1>&2 +tar -x -C "$TMP_DIR/extracted" -f "$TMP_DIR/archive.tar.gz" --strip-components=1 1>&2 + +echo "Initializing temporary git repo..." >&2 +git -C "$TMP_DIR/extracted" init 1>&2 +git -C "$TMP_DIR/extracted" add . 1>&2 +git -C "$TMP_DIR/extracted" commit --no-verify -m "original state" -q 1>&2 + +echo "Patching build targets..." >&2 +find $TMP_DIR/extracted -name BUILD.bazel -or -name BUILD | while read f; do + buildozer 'add features "-layering_check"' $(dirname $f):%cc_library 1>&2 || exit_code=$? + if [[ $exit_code -ne 0 && $exit_code -ne 3 ]]; then + echo "Buildozer command failed with exit code: $exit_code" >&2 + exit $exit_code + fi +done + +echo "Generating diff..." >&2 +git -C "$TMP_DIR/extracted" --no-pager diff diff --git a/third_party/xla/build_tools/lint/generate_compile_commands.py b/third_party/xla/build_tools/lint/generate_compile_commands.py index ec9d6fe0d2037b..1c7e6f930931ed 100644 --- a/third_party/xla/build_tools/lint/generate_compile_commands.py +++ b/third_party/xla/build_tools/lint/generate_compile_commands.py @@ -67,7 +67,11 @@ def from_args_list(cls, args_list: list[str]) -> "CompileCommand": if arg.endswith(".cc"): cc_file = arg - filtered_args.append(arg) + # Split generated commands, because otherwise they get wrapped + # into "command with spaces" when passed to clangd, and clangd + # can't parse them correctly. + for s in arg.split(" "): + filtered_args.append(s) return cls(cc_file, filtered_args) diff --git a/third_party/xla/build_tools/pjrt_wheels/BUILD.bazel b/third_party/xla/build_tools/pjrt_wheels/BUILD.bazel index 5848bb71b1c20c..b9adc77a1690e8 100644 --- a/third_party/xla/build_tools/pjrt_wheels/BUILD.bazel +++ b/third_party/xla/build_tools/pjrt_wheels/BUILD.bazel @@ -1,6 +1,8 @@ load("@cuda_cudart//:version.bzl", cuda_major_version = "VERSION") load("@nightly_timestamp//:timestamp.bzl", "XLA_NIGHTLY_TIMESTAMP") load("@rc_number//:rc_number.bzl", "XLA_RC_NUMBER") +load("@rules_cc//cc:cc_binary.bzl", "cc_binary") +load("@rules_cc//cc:cc_test.bzl", "cc_test") load("@rules_python//python:packaging.bzl", "py_wheel") # This ensures we can only build plugins for selected CUDA versions. diff --git a/third_party/xla/docs/flags_guidance.md b/third_party/xla/docs/flags_guidance.md index f887b66c4fbc50..e2fff15a4cdcb1 100644 --- a/third_party/xla/docs/flags_guidance.md +++ b/third_party/xla/docs/flags_guidance.md @@ -79,22 +79,22 @@ data-parallel collectives (`xla_gpu_enable_pipelined_all_gather`, (`xla_gpu_enable_while_loop_double_buffering`), latency hiding scheduling (`xla_gpu_enable_latency_hiding_scheduler`), and SOL latency estimator on Hopper/Blackwell (`xla_gpu_enable_analytical_sol_latency_estimator`). See -[GPU Optimization Levels](https://openxla.org/xla/gpu_optimization_levels) for -details. - -| Flag | Type | Notes | -| :---- | :---- | :----- | -| `xla_gpu_enable_latency_hiding_scheduler` | Boolean (true/false) |This flag enables latency hiding schedulers to overlap asynchronous communication with computation efficiently. The default value is False. | -| `xla_gpu_enable_analytical_sol_latency_estimator` | Boolean (true/false) | Enables platform specific scheduling decisions, which in turn improve compute-communication overlap. The default value is true. | -| `xla_gpu_analytical_latency_estimator_options` | Structured string | Configures parameters for the `xla_gpu_enable_analytical_sol_latency_estimator`.
Adjust by setting `nic_speed_gbps=$NIC_SPEED,nccl_op_launch_us=$LAUNCH_OVERHEAD,chunk_prep_us=$CHUNK_PREP,rtt_us=$RTT,chunk_size_bytes=$CHUNK_SIZE,gpus_per_node=$GPUS_PER_NODE`. The default value depends on a detected platform. | -| `xla_gpu_enable_triton_gemm` | Boolean (true/false) | Use Triton-based matrix multiplication. | -| `xla_gpu_enable_command_buffer` | List of CommandBufferCmdType | Which kind of commands should be captured in command buffers. | -| `xla_gpu_all_reduce_combine_threshold_bytes` | Integer (bytes) | These flags tune when to combine multiple small AllGather / ReduceScatter / AllReduce into one big AllGather / ReduceScatter / AllReduce to reduce time spent on cross-device communication. For example, for the AllGather / ReduceScatter thresholds on a Transformer-based workload, consider tuning them high enough so as to combine at least a Transformer Layer’s weight AllGather / ReduceScatter. By default, the combine_threshold_bytes is set to 256. | -| `xla_gpu_all_gather_combine_threshold_bytes` | Integer (bytes) | See xla_gpu_all_reduce_combine_threshold_bytes above. | -| `xla_gpu_reduce_scatter_combine_threshold_bytes` | Integer (bytes) | See xla_gpu_all_reduce_combine_threshold_bytes above. | -| `xla_gpu_enable_pipelined_all_gather` | Boolean (true/false) | Enable pipelinling of all-gather instructions. | -| `xla_gpu_enable_pipelined_reduce_scatter` | Boolean (true/false) | Enable pipelinling of reduce-scatter instructions. | -| `xla_gpu_enable_pipelined_all_reduce` | Boolean (true/false) | Enable pipelinling of all-reduce instructions. | -| `xla_gpu_enable_while_loop_double_buffering` | Boolean (true/false) | Enable double-buffering for while loop. | -| `xla_gpu_enable_all_gather_combine_by_dim` | Boolean (true/false) | Combine all-gather ops with the same gather dimension or irrespective of their dimension. | -| `xla_gpu_enable_reduce_scatter_combine_by_dim` | Boolean (true/false) | Combine reduce-scatter ops with the same dimension or irrespective of their dimension. | +[GPU Effort Levels](https://openxla.org/xla/effort_levels) for details. + +Flag | Type | Notes +:------------------------------------------------ | :--------------------------- | :---- +`xla_gpu_enable_latency_hiding_scheduler` | Boolean (true/false) | This flag enables latency hiding schedulers to overlap asynchronous communication with computation efficiently. The default value is False. +`xla_gpu_enable_analytical_sol_latency_estimator` | Boolean (true/false) | Enables platform specific scheduling decisions, which in turn improve compute-communication overlap. The default value is true. +`xla_gpu_analytical_latency_estimator_options` | Structured string | Configures parameters for the `xla_gpu_enable_analytical_sol_latency_estimator`. Adjust by setting `nic_speed_gbps=$NIC_SPEED,nccl_op_launch_us=$LAUNCH_OVERHEAD,chunk_prep_us=$CHUNK_PREP,rtt_us=$RTT,chunk_size_bytes=$CHUNK_SIZE,gpus_per_node=$GPUS_PER_NODE`. The default value depends on a detected platform. +`xla_gpu_enable_triton_gemm` | Boolean (true/false) | Use Triton-based matrix multiplication. +`xla_gpu_enable_command_buffer` | List of CommandBufferCmdType | Which kind of commands should be captured in command buffers. +`xla_gpu_all_reduce_combine_threshold_bytes` | Integer (bytes) | These flags tune when to combine multiple small AllGather / ReduceScatter / AllReduce into one big AllGather / ReduceScatter / AllReduce to reduce time spent on cross-device communication. 
For example, for the AllGather / ReduceScatter thresholds on a Transformer-based workload, consider tuning them high enough so as to combine at least a Transformer Layer’s weight AllGather / ReduceScatter. By default, the combine_threshold_bytes is set to 256. +`xla_gpu_all_gather_combine_threshold_bytes` | Integer (bytes) | See xla_gpu_all_reduce_combine_threshold_bytes above. +`xla_gpu_reduce_scatter_combine_threshold_bytes` | Integer (bytes) | See xla_gpu_all_reduce_combine_threshold_bytes above. +`xla_gpu_enable_pipelined_all_gather` | Boolean (true/false) | Enable pipelining of all-gather instructions. +`xla_gpu_enable_pipelined_reduce_scatter` | Boolean (true/false) | Enable pipelining of reduce-scatter instructions. +`xla_gpu_enable_pipelined_all_reduce` | Boolean (true/false) | Enable pipelining of all-reduce instructions. +`xla_gpu_enable_pipelined_host_offloading` | Boolean (true/false) | Enable pipelining of host offloading instructions. +`xla_gpu_enable_while_loop_double_buffering` | Boolean (true/false) | Enable double-buffering for while loop. +`xla_gpu_enable_all_gather_combine_by_dim` | Boolean (true/false) | Combine all-gather ops with the same gather dimension or irrespective of their dimension. +`xla_gpu_enable_reduce_scatter_combine_by_dim` | Boolean (true/false) | Combine reduce-scatter ops with the same dimension or irrespective of their dimension. diff --git a/third_party/xla/docs/test_hlo_passes.md b/third_party/xla/docs/test_hlo_passes.md index 8afcf6bf773aaf..723406e74d991a 100644 --- a/third_party/xla/docs/test_hlo_passes.md +++ b/third_party/xla/docs/test_hlo_passes.md @@ -51,7 +51,7 @@ For example, some be written as follows: ``` -// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s +// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s HloModule Test, is_scheduled=true fused_computation { diff --git a/third_party/xla/opensource_only.files b/third_party/xla/opensource_only.files index 4a78380bc9dd7d..a9bf5dcaee9f9f 100644 --- a/third_party/xla/opensource_only.files +++ b/third_party/xla/opensource_only.files @@ -1,6 +1,7 @@ tensorflow/compiler/xla/backends/cpu/nanort/package_groups.bzl: tensorflow/compiler/xla/backends/cpu/package_groups.bzl: tensorflow/compiler/xla/internal/package_groups.bzl: +tensorflow/compiler/xla/megascale/package_groups.bzl: tensorflow/compiler/xla/mlir_hlo/WORKSPACE: tensorflow/compiler/xla/package_groups.bzl: tensorflow/compiler/xla/pjrt/cpu/package_groups.bzl: @@ -156,5 +157,4 @@ xla/third_party/tensorrt/tensorrt/tensorrt_config.py.tpl: xla/third_party/tensorrt/tensorrt_configure.bzl: xla/third_party/tensorrt/workspace.bzl: xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h: -xla/third_party/xla_googletest_wrapper/include/gtest/gtest.h: xla/third_party/zlib.BUILD: diff --git a/third_party/xla/tensorflow.bazelrc b/third_party/xla/tensorflow.bazelrc index 4d07e058d098cc..d9c9c10d9f3122 100644 --- a/third_party/xla/tensorflow.bazelrc +++ b/third_party/xla/tensorflow.bazelrc @@ -189,10 +189,11 @@ common:mkl_aarch64_threadpool -c opt # This is an alias for the mkl_aarch64_threadpool build. common:mkl_aarch64 --config=mkl_aarch64_threadpool -# Default CUDA, CUDNN and NVSHMEM versions. +# Default CUDA, CUDNN, NCCL and NVSHMEM versions.
common:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.9.1" common:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.8.0" common:cuda_version --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5" +common:cuda_version --repo_env=HERMETIC_NCCL_VERSION="2.27.7" # CUDA: This config refers to building CUDA op kernels with nvcc. common:cuda --repo_env TF_NEED_CUDA=1 @@ -280,8 +281,6 @@ common:rocm_base --copt=-Wno-gnu-offsetof-extensions common:rocm_base --crosstool_top=@local_config_rocm//crosstool:toolchain common:rocm_base --define=using_rocm_hipcc=true common:rocm_base --define=tensorflow_mkldnn_contraction_kernel=0 -common:rocm_base --define=xnn_enable_avxvnniint8=false -common:rocm_base --define=xnn_enable_avx512fp16=false common:rocm_base --repo_env TF_NEED_ROCM=1 common:rocm_clang_official --config=rocm_base @@ -295,7 +294,9 @@ common:rocm_ci --config=rocm common:rocm_ci_hermetic --dynamic_mode=off common:rocm_ci_hermetic --config=rocm_clang_official -common:rocm_ci_hermetic --repo_env="ROCM_DISTRO_VERSION=rocm_7.10.0_gfx94X" +common:rocm_ci_hermetic --repo_env="ROCM_DISTRO_URL=https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-linux-gfx94X-dcgpu-7.10.0a20251107.tar.gz" +common:rocm_ci_hermetic --repo_env="ROCM_DISTRO_HASH=486dbf647bcf9b78f21d7477f43addc7b2075b1a322a119045db9cdc5eb98380" +common:rocm_ci_hermetic --repo_env="ROCM_DISTRO_LINKS=llvm/amdgcn:amdgcn" common:rocm_ci_hermetic --@local_config_rocm//rocm:rocm_path_type=hermetic # This config option is used for SYCL as GPU backend. @@ -538,12 +539,6 @@ common:rbe_linux_cpu_clang_local --extra_toolchains="@local_config_cuda//crossto common:rbe_linux_cpu_clang_local --repo_env=CC="/usr/lib/llvm-18/bin/clang" common:rbe_linux_cpu_clang_local --repo_env=TF_SYSROOT="/dt9" -# Download CUDA/CUDNN redistributions to preserve the repositories cache between -# CPU and GPU builds. -# TODO(ybaturina): Uncomment when RBE is ready to support this. 
-# common:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1 -# common:rbe_linux_cpu --config=cuda_version - common:rbe_linux_cuda --config=cuda_clang_official common:rbe_linux_cuda --config=rbe_linux_cpu # dt9 is based on glibc 2.17, which is outdated and incompatible with CUDA 12.8.0 diff --git a/third_party/xla/third_party/boringssl.patch b/third_party/xla/third_party/boringssl.patch new file mode 100644 index 00000000000000..31433753e3abde --- /dev/null +++ b/third_party/xla/third_party/boringssl.patch @@ -0,0 +1,13 @@ +diff --git a/BUILD b/BUILD +index 206786442..3d1624382 100644 +--- a/BUILD ++++ b/BUILD +@@ -145,7 +145,7 @@ cc_library( + + cc_library( + name = "ssl", +- srcs = ssl_sources + ssl_internal_headers, ++ srcs = ssl_sources + ssl_internal_headers + crypto_internal_headers, + hdrs = ssl_headers, + copts = boringssl_copts_cxx, + includes = ["src/include"], diff --git a/third_party/xla/third_party/brotli/BUILD.bazel b/third_party/xla/third_party/brotli/BUILD.bazel new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/third_party/xla/third_party/brotli/workspace.bzl b/third_party/xla/third_party/brotli/workspace.bzl new file mode 100644 index 00000000000000..ec76237744b347 --- /dev/null +++ b/third_party/xla/third_party/brotli/workspace.bzl @@ -0,0 +1,11 @@ +"""Provides the repo macro to import brotli""" + +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") + +def repo(): + tf_http_archive( + name = "org_brotli", + sha256 = "e720a6ca29428b803f4ad165371771f5398faba397edf6778837a18599ea13ff", + strip_prefix = "brotli-1.1.0", + urls = tf_mirror_urls("https://github.com/google/brotli/archive/refs/tags/v1.1.0.tar.gz"), + ) diff --git a/third_party/xla/third_party/cudnn_frontend/workspace.bzl b/third_party/xla/third_party/cudnn_frontend/workspace.bzl index f954a198969f87..1bea852e045b88 100644 --- a/third_party/xla/third_party/cudnn_frontend/workspace.bzl +++ b/third_party/xla/third_party/cudnn_frontend/workspace.bzl @@ -7,7 +7,7 @@ def repo(): name = "cudnn_frontend_archive", build_file = "//third_party:cudnn_frontend.BUILD", patch_file = ["//third_party:cudnn_frontend_header_fix.patch"], - sha256 = "257b3b7f8a99abc096094abc9e5011659117b647d55293bcd2c5659f9181b99e", - strip_prefix = "cudnn-frontend-1.13.0", - urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.13.0.zip"), + sha256 = "453d4650e6a25ede58fbbd7077c64ebe92734218d474ec7371bb13fa6d2181fa", + strip_prefix = "cudnn-frontend-1.16.1", + urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.16.1.zip"), ) diff --git a/third_party/xla/third_party/curl.BUILD b/third_party/xla/third_party/curl.BUILD index cb33aa940fe48f..c1884050002a5e 100644 --- a/third_party/xla/third_party/curl.BUILD +++ b/third_party/xla/third_party/curl.BUILD @@ -442,6 +442,7 @@ cc_library( "@local_xla//xla/tsl:ios": [], "@local_xla//xla/tsl:windows": [], "//conditions:default": [ + "@boringssl//:crypto", "@boringssl//:ssl", ], }), diff --git a/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch b/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch new file mode 100644 index 00000000000000..501d22d00cb301 --- /dev/null +++ b/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch @@ -0,0 +1,164 @@ +From 5c2d2d62a71fe19c92c6f807d533c0ea90c15f03 Mon Sep 17 00:00:00 2001 +From: Marcin Radomski +Date: Thu, 4 Dec 2025 
15:37:45 +0000 +Subject: [PATCH 1/2] Add ASSERT_OK/EXPECT_OK/ASSERT_OK_AND_ASSIGN macros + +Create this patch with git diff -U2 / git format-patch -U2 to avoid +mismatches with googletest.patch. +--- + BUILD.bazel | 5 +- + googlemock/include/gmock/gmock.h | 1 + + .../include/gmock/internal/xla-gmock-macros.h | 118 ++++++++++++++++++ + 3 files changed, 123 insertions(+), 1 deletion(-) + create mode 100644 googlemock/include/gmock/internal/xla-gmock-macros.h + +diff --git a/BUILD.bazel b/BUILD.bazel +index 008af6a1..32d2a22c 100644 +--- a/BUILD.bazel ++++ b/BUILD.bazel +@@ -163,5 +163,8 @@ cc_library( + ], + "//conditions:default": [], +- }), ++ }) + [ ++ "@abseil-cpp//absl/status", ++ "@abseil-cpp//absl/status:statusor", ++ ], + ) + +diff --git a/googlemock/include/gmock/gmock.h b/googlemock/include/gmock/gmock.h +index c78fb8ee..69b33572 100644 +--- a/googlemock/include/gmock/gmock.h ++++ b/googlemock/include/gmock/gmock.h +@@ -95,3 +95,4 @@ GTEST_API_ void InitGoogleMock(); + } // namespace testing + ++#include "gmock/internal/xla-gmock-macros.h" + #endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_ +diff --git a/googlemock/include/gmock/internal/xla-gmock-macros.h b/googlemock/include/gmock/internal/xla-gmock-macros.h +new file mode 100644 +index 00000000..b851bcca +--- /dev/null ++++ b/googlemock/include/gmock/internal/xla-gmock-macros.h +@@ -0,0 +1,118 @@ ++/* Copyright 2025 The Abseil Authors & TensorFlow Authors. All Rights Reserved. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_XLA_GMOCK_MACROS_H_ ++#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_XLA_GMOCK_MACROS_H_ ++ ++// gmock/gmock.h wrapper that also provides assert macros. ++// ++// These already exist in internal version of gmock, but upstream version ++// doesn't have them. We use this wrapper to make dependency translation when ++// exporting to OSS easier. ++// ++// - We want to use standard internal header and ASSERT_OK, EXPECT_OK macros ++// when developing internally. ++// - We want the same macros to work externally, rather than having to add or ++// strip TF_ prefix. ++// - We want the OSS export to still work after the export and header ++// translation. ++// - We want to minimize the amount of patching third party projects to reduce ++// maintenance overhead. ++// - To ensure the OSS patches cleanly apply onto internal repo, we need the ++// header translation to be reversible, which requires 1:1 header mapping. ++// ++// To achieve this, we add those macros to gmock for all XLA code, which ++// should (TM) make ASSERT_OK/EXPECT_OK "just work" in all XLA tests. ++// ++// absl/status/status_matchers.h depends on gmock.h, so we can't simply add it ++// here. This causes a circular dependency between this and absl - which bazel ++// doesn't allow. 
++ ++#include "absl/status/status.h" ++#include "absl/status/statusor.h" ++ ++// Macros for testing the results of functions that return absl::Status or ++// absl::StatusOr<T> (for any type T). ++#define EXPECT_OK(expression) \ ++  EXPECT_THAT(expression, ::xla_testing::internal::IsOk()) ++#define ASSERT_OK(expression) \ ++  ASSERT_THAT(expression, ::xla_testing::internal::IsOk()) ++ ++#define ASSERT_OK_AND_ASSIGN(lhs, rexpr) \ ++  ASSERT_OK_AND_ASSIGN_IMPL( \ ++      XLA_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), \ ++      lhs, rexpr); ++ ++#define ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr) \ ++  auto statusor = (rexpr); \ ++  ASSERT_OK(statusor.status()); \ ++  lhs = std::move(statusor).value() ++ ++#define XLA_STATUS_MACROS_CONCAT_NAME(x, y) XLA_STATUS_MACROS_CONCAT_IMPL(x, y) ++#define XLA_STATUS_MACROS_CONCAT_IMPL(x, y) x##y ++ ++namespace xla_testing { ++namespace internal { ++ ++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead. ++inline const absl::Status& GetStatus(const absl::Status& status) { ++ return status; ++} ++ ++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead. ++template <typename T> ++inline const absl::Status& GetStatus(const absl::StatusOr<T>& status) { ++ return status.status(); ++} ++ ++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead. ++// ++// Monomorphic implementation of matcher IsOk() for a given type T. ++// T can be Status, StatusOr<>, or a reference to either of them. ++template <typename T> ++class MonoIsOkMatcherImpl : public ::testing::MatcherInterface<T> { ++ public: ++ void DescribeTo(std::ostream* os) const override { *os << "is OK"; } ++ void DescribeNegationTo(std::ostream* os) const override { ++ *os << "is not OK"; ++ } ++ bool MatchAndExplain(T actual_value, ++ ::testing::MatchResultListener*) const override { ++ return GetStatus(actual_value).ok(); ++ } ++}; ++ ++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead. ++// ++// Implements IsOk() as a polymorphic matcher. ++class IsOkMatcher { ++ public: ++ template <typename T> ++ /*implicit*/ operator ::testing::Matcher<T>() const { // NOLINT ++ return ::testing::Matcher<T>(new MonoIsOkMatcherImpl<T>()); ++ } ++}; ++ ++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead. ++// ++// Returns a gMock matcher that matches a Status or StatusOr<> which is OK.
++inline ::xla_testing::internal::IsOkMatcher IsOk() { ++ return ::xla_testing::internal::IsOkMatcher(); ++} ++ ++} // namespace internal ++} // namespace xla_testing ++ ++#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_XLA_GMOCK_MACROS_H_ +-- +2.52.0.223.gf5cc29aaa4-goog + diff --git a/third_party/xla/third_party/googletest/0002-Rename-dependencies-for-workspace.bzl-build.patch b/third_party/xla/third_party/googletest/0002-Rename-dependencies-for-workspace.bzl-build.patch new file mode 100644 index 00000000000000..93fa7f98c1b156 --- /dev/null +++ b/third_party/xla/third_party/googletest/0002-Rename-dependencies-for-workspace.bzl-build.patch @@ -0,0 +1,30 @@ +From 21affdb9aaa50767264c13d607d47cb2104c4e4a Mon Sep 17 00:00:00 2001 +From: Marcin Radomski +Date: Tue, 9 Dec 2025 18:23:26 +0000 +Subject: [PATCH 2/2] Rename dependencies for workspace.bzl build + +Must be separate from googletest.patch, because: +- Tensorflow applies googletest.patch only +- XLA bzlmod build applies patch that adds assert macros only, and + needs different repository name in deps +- XLA workspace.bzl build applies everything +--- + BUILD.bazel | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/BUILD.bazel b/BUILD.bazel +index 32d2a22c..a122fa28 100644 +--- a/BUILD.bazel ++++ b/BUILD.bazel +@@ -164,6 +164,6 @@ cc_library( + "//conditions:default": [], + }) + [ +- "@abseil-cpp//absl/status", +- "@abseil-cpp//absl/status:statusor", ++ "@com_google_absl//absl/status", ++ "@com_google_absl//absl/status:statusor", + ], + ) +-- +2.52.0.223.gf5cc29aaa4-goog + diff --git a/third_party/xla/third_party/googletest/README.add-status-macros.md b/third_party/xla/third_party/googletest/README.add-status-macros.md new file mode 100644 index 00000000000000..8a169b6f6031d5 --- /dev/null +++ b/third_party/xla/third_party/googletest/README.add-status-macros.md @@ -0,0 +1,41 @@ +add-status-macros.patch adds `ASSERT_OK`, `EXPECT_OK`, `ASSERT_OK_AND_ASSIGN` +to gmock.h so that the header's provided functionality matches internal gmock. + +What other things have we tried? + +1. Introducing a custom header to be used in OSS instead of `gmock/gmock.h`. + + The export-to-OSS process imposes a few restrictions. Notably, header + rewrite has to be reversible, so we need a 1:1 mapping between headers used + internally and in OSS. + + If we introduced a custom header to be used in OSS instead of gmock, it + would have to take the place of the current rewrite of internal gmock to + `gmock/gmock.h`. This means any use of `gmock/gmock.h` in OSS XLA code can + no longer map to internal gmock. We'd have to ban the header. + + Therefore, updating OSS `gmock/gmock.h` seems necessary. + +2. Patching in the extra macros to `gmock/gmock.h` by including + `absl/status/status_macros.h`. + + This introduces a circular dependency between absl and gmock which makes + bazel strongly opposed to the idea. + +3. Introducing a googletest bazel module wrapper. + + This would be a module that would proxy all `gmock/gmock.h` within XLA without + additional patching of googletest. However, having multiple sources of the + same gmock/gmock.h header path only works *sometimes*. The order of include + paths emitted by bazel depends on the target definition and ordering of + dependencies, so it ends up working in some cases and not in others. + +4. Expanding 3. by renaming googletest's `gmock.h` to `gmock.upstream.h` to + avoid header name conflicts.
+ + `gmock/gmock.h` is also included by googletest itself, so redirecting it to + `gmock/gmock.upstream.h` is needed. That boils down to even more brittle + patching. + +Overall, the add-status-macros.patch change is the least invasive one that +works. diff --git a/third_party/xla/third_party/googletest/googletest.patch b/third_party/xla/third_party/googletest/googletest.patch index 7e6e300ed273a1..b9f95d91084e6d 100644 --- a/third_party/xla/third_party/googletest/googletest.patch +++ b/third_party/xla/third_party/googletest/googletest.patch @@ -2,40 +2,14 @@ diff --git a/BUILD.bazel b/BUILD.bazel index cc254457..49120384 100644 --- a/BUILD.bazel +++ b/BUILD.bazel -@@ -142,16 +142,16 @@ cc_library( - }), - deps = select({ - ":has_absl": [ -- "@abseil-cpp//absl/container:flat_hash_set", -- "@abseil-cpp//absl/debugging:failure_signal_handler", -- "@abseil-cpp//absl/debugging:stacktrace", -- "@abseil-cpp//absl/debugging:symbolize", -- "@abseil-cpp//absl/flags:flag", -- "@abseil-cpp//absl/flags:parse", -- "@abseil-cpp//absl/flags:reflection", -- "@abseil-cpp//absl/flags:usage", -- "@abseil-cpp//absl/strings", -- "@re2", -+ "@com_google_absl//absl/container:flat_hash_set", -+ "@com_google_absl//absl/debugging:failure_signal_handler", -+ "@com_google_absl//absl/debugging:stacktrace", -+ "@com_google_absl//absl/debugging:symbolize", -+ "@com_google_absl//absl/flags:flag", -+ "@com_google_absl//absl/flags:parse", -+ "@com_google_absl//absl/flags:reflection", -+ "@com_google_absl//absl/flags:usage", -+ "@com_google_absl//absl/strings", -+ "@com_googlesource_code_re2//:re2", - ], +@@ -178,6 +178,10 @@ alias( + cc_library( + name = "gtest_main", + srcs = ["googlemock/src/gmock_main.cc"], ++ hdrs = glob([ ++ "googletest/include/gtest/*.h", ++ "googlemock/include/gmock/*.h", ++ ]), + features = select({ + ":windows": ["windows_export_all_symbols"], "//conditions:default": [], - }) + select({ -@@ -160,9 +160,6 @@ cc_library( - # so that's why these libraries are needed. - # Otherwise, builds targeting Fuchsia would fail to compile. 
- ":fuchsia": [ -- "@fuchsia_sdk//pkg/fdio", -- "@fuchsia_sdk//pkg/syslog", -- "@fuchsia_sdk//pkg/zx", - ], - "//conditions:default": [], - }), diff --git a/third_party/xla/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl b/third_party/xla/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl index ffa305c772e881..e9da7383842473 100644 --- a/third_party/xla/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl +++ b/third_party/xla/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl @@ -16,6 +16,7 @@ load( "with_feature_set", ) load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES") +load("@rules_cc//cc/toolchains:cc_toolchain_config_info.bzl", "CcToolchainConfigInfo") def all_assembly_actions(): return [ diff --git a/third_party/xla/third_party/gpus/crosstool/hipcc_cc_toolchain_config.bzl.tpl b/third_party/xla/third_party/gpus/crosstool/hipcc_cc_toolchain_config.bzl.tpl index e5a942b66c17fc..a97202d8e9fb61 100644 --- a/third_party/xla/third_party/gpus/crosstool/hipcc_cc_toolchain_config.bzl.tpl +++ b/third_party/xla/third_party/gpus/crosstool/hipcc_cc_toolchain_config.bzl.tpl @@ -11,6 +11,7 @@ load( "with_feature_set", ) load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES") +load("@rules_cc//cc/toolchains:cc_toolchain_config_info.bzl", "CcToolchainConfigInfo") all_compile_actions = [ ACTION_NAMES.c_compile, diff --git a/third_party/xla/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl b/third_party/xla/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl index 5d0295a6ee448b..e754300e3dbc9d 100644 --- a/third_party/xla/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl +++ b/third_party/xla/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl @@ -16,6 +16,7 @@ load( "with_feature_set", ) load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES") +load("@rules_cc//cc/toolchains:cc_toolchain_config_info.bzl", "CcToolchainConfigInfo") def all_assembly_actions(): return [ diff --git a/third_party/xla/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/xla/third_party/gpus/cuda/build_defs.bzl.tpl index 40ca4a62607cda..3ee6d2d348b2fc 100644 --- a/third_party/xla/third_party/gpus/cuda/build_defs.bzl.tpl +++ b/third_party/xla/third_party/gpus/cuda/build_defs.bzl.tpl @@ -1,3 +1,5 @@ +load("@rules_cc//cc:cc_library.bzl", "cc_library") + # Macros for building CUDA code. def if_cuda(if_true, if_false = []): """Shorthand for select()'ing on whether we're building with CUDA. @@ -137,7 +139,7 @@ def cuda_header_library( target without virtual includes. This works around the fact that bazel can't mix 'includes' and 'include_prefix' in the same target.""" - native.cc_library( + cc_library( name = name + "_virtual", hdrs = hdrs, include_prefix = include_prefix, @@ -146,7 +148,7 @@ def cuda_header_library( visibility = ["//visibility:private"], ) - native.cc_library( + cc_library( name = name, textual_hdrs = hdrs, deps = deps + [":%s_virtual" % name], @@ -160,7 +162,7 @@ def cuda_library(copts = [], tags = [], deps = [], **kwargs): # "use of the "register" storage class specifier is not allowed" error. # This can and should be removed once we migrate on glibc-2.27 or newer. 
local_defines = kwargs.pop("local_defines", []) + ["register="] - native.cc_library( + cc_library( copts = cuda_default_copts() + copts, tags = tags + [ "gpu", diff --git a/third_party/xla/third_party/gpus/rocm/BUILD.tpl b/third_party/xla/third_party/gpus/rocm/BUILD.tpl index c95f9a95933fbc..de7d5421af6ffa 100644 --- a/third_party/xla/third_party/gpus/rocm/BUILD.tpl +++ b/third_party/xla/third_party/gpus/rocm/BUILD.tpl @@ -105,6 +105,7 @@ cc_library( ":hip", ":hipblas", ":hipblaslt", + ":hipfft", ":hiprand", ":hipsolver", ":hipsparse", @@ -116,12 +117,7 @@ cc_library( ":rocsolver", ":rocsparse", ":roctracer", - ] + select_threshold( - above_or_eq = [":hipfft"], - below = [":rocfft"], - threshold = 40100, - value = rocm_version_number(), - ), + ], ) cc_library( @@ -150,9 +146,11 @@ cc_library( ], ":multiple_rocm_paths": [ "-Wl,-rpath=%{rocm_lib_paths}", + "-Lexternal/local_config_rocm/rocm/%{rocm_root}/lib", ], "//conditions:default": [ "-Wl,-rpath,/opt/rocm/lib", + "-Lexternal/local_config_rocm/rocm/%{rocm_root}/lib", ], }), visibility = ["//visibility:public"], @@ -410,11 +408,15 @@ cc_library( cc_library( name = "rocsolver", hdrs = glob(["%{rocm_root}/include/rocsolver/**"]), - data = glob(["%{rocm_root}/lib/librocsolver*.so*"]), + data = glob([ + "%{rocm_root}/lib/librocsolver*.so*", + "%{rocm_root}/lib/host-math/lib/*.so*", + ]), include_prefix = "rocm", includes = [ "%{rocm_root}/include/", ], + linkopts = ["-lrocsolver"], strip_include_prefix = "%{rocm_root}", visibility = ["//visibility:public"], deps = [ @@ -425,14 +427,18 @@ cc_library( cc_library( name = "rocsparse", - srcs = glob(["%{rocm_root}/lib/librocsparse*.so*"]), + data = glob(["%{rocm_root}/lib/librocsparse*.so*"]), include_prefix = "rocm", includes = [ "%{rocm_root}/include/", ], + linkopts = ["-lrocsparse"], strip_include_prefix = "%{rocm_root}", visibility = ["//visibility:public"], - deps = [":rocm_config"], + deps = [ + ":rocm_config", + ":rocm_rpath", + ], ) cc_library( @@ -443,9 +449,14 @@ cc_library( includes = [ "%{rocm_root}/include/", ], + linkopts = ["-lhipsolver"], strip_include_prefix = "%{rocm_root}", visibility = ["//visibility:public"], - deps = [":rocm_config"], + deps = [ + ":rocm_config", + ":rocm_rpath", + ":rocsparse", + ], ) cc_library( @@ -456,6 +467,7 @@ cc_library( includes = [ "%{rocm_root}/include/", ], + linkopts = ["-lhipblas"], strip_include_prefix = "%{rocm_root}", visibility = ["//visibility:public"], deps = [ @@ -533,7 +545,8 @@ cc_library( ) cc_library( - name = "amd_comgr", + name = "amd_comgr_dynamic", + srcs = ["%{rocm_root}/lib/libamd_comgr_stub.a"], hdrs = glob(["%{rocm_root}/include/amd_comgr/**"]), data = glob([ "%{rocm_root}/lib/libamd_comgr_loader.so*", @@ -544,12 +557,7 @@ cc_library( includes = [ "%{rocm_root}/include", ], - linkopts = select({ - ":build_hermetic": [ - "-lamd_comgr_loader", - ], - "//conditions:default": [], - }), + linkopts = ["-lamd_comgr_loader"], strip_include_prefix = "%{rocm_root}", deps = [ ":rocm_config", @@ -558,6 +566,35 @@ cc_library( ], ) +cc_library( + name = "amd_comgr_static", + hdrs = glob(["%{rocm_root}/include/amd_comgr/**"]), + data = glob([ + "%{rocm_root}/lib/libamd_comgr.so*", + ]), + include_prefix = "rocm", + includes = [ + "%{rocm_root}/include", + ], + linkopts = ["-lamd_comgr"], + strip_include_prefix = "%{rocm_root}", + deps = [ + ":rocm_config", + ":rocm_rpath", + ":system_libs", + ], +) + +alias( + name = "amd_comgr", + actual = select_threshold( + above_or_eq = ":amd_comgr_dynamic", + below = ":amd_comgr_static", + threshold = 
71000, + value = rocm_version_number(), + ), +) + cc_library( name = "rocm_smi", srcs = glob([ diff --git a/third_party/xla/third_party/gpus/rocm/build_defs.bzl.tpl b/third_party/xla/third_party/gpus/rocm/build_defs.bzl.tpl index a690f767d8dbd5..d04a045907f274 100644 --- a/third_party/xla/third_party/gpus/rocm/build_defs.bzl.tpl +++ b/third_party/xla/third_party/gpus/rocm/build_defs.bzl.tpl @@ -1,3 +1,5 @@ +load("@rules_cc//cc:cc_library.bzl", "cc_library") + # Macros for building ROCm code. def if_rocm(if_true, if_false = []): """Shorthand for select()'ing on whether we're building with ROCm. @@ -80,7 +82,7 @@ def rocm_library(copts = [], deps = [], **kwargs): """Wrapper over cc_library which adds default ROCm options.""" if "@local_config_rocm//rocm:rocm_headers" not in deps: deps.append("@local_config_rocm//rocm:rocm_headers") - native.cc_library(copts = rocm_default_copts() + copts, deps = deps, **kwargs) + cc_library(copts = rocm_default_copts() + copts, deps = deps, **kwargs) def get_rbe_amdgpu_pool(is_single_gpu = False): return "%{single_gpu_rbe_pool}" if is_single_gpu else "%{multi_gpu_rbe_pool}" diff --git a/third_party/xla/third_party/gpus/rocm/rocm_redist.bzl b/third_party/xla/third_party/gpus/rocm/rocm_redist.bzl index 0628122609f8a2..6f7db647259a84 100644 --- a/third_party/xla/third_party/gpus/rocm/rocm_redist.bzl +++ b/third_party/xla/third_party/gpus/rocm/rocm_redist.bzl @@ -39,3 +39,22 @@ rocm_redist = { rocm_root = "_rocm_sdk_devel", ), } + +def _parse_rocm_distro_links(distro_links): + result = [] + for pair in distro_links.split(","): + link = pair.split(":") + result.append(struct(target = link[0], link = link[1])) + return result + +def create_rocm_distro(distro_url, distro_hash, symlinks): + return struct( + packages = [ + { + "url": distro_url, + "sha256": distro_hash, + }, + ], + required_softlinks = _parse_rocm_distro_links(symlinks), + rocm_root = "", + ) diff --git a/third_party/xla/third_party/gpus/rocm_configure.bzl b/third_party/xla/third_party/gpus/rocm_configure.bzl index 0e0240f00ee501..0f7d57ccbce5f7 100644 --- a/third_party/xla/third_party/gpus/rocm_configure.bzl +++ b/third_party/xla/third_party/gpus/rocm_configure.bzl @@ -16,6 +16,7 @@ load("@bazel_skylib//lib:paths.bzl", "paths") load( "//third_party/gpus/rocm:rocm_redist.bzl", + "create_rocm_distro", "rocm_redist", ) load( @@ -53,6 +54,9 @@ _TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS" _TF_ROCM_CONFIG_REPO = "TF_ROCM_CONFIG_REPO" _DISTRIBUTION_PATH = "rocm/rocm_dist" _ROCM_DISTRO_VERSION = "ROCM_DISTRO_VERSION" +_ROCM_DISTRO_URL = "ROCM_DISTRO_URL" +_ROCM_DISTRO_HASH = "ROCM_DISTRO_HASH" +_ROCM_DISTRO_LINKS = "ROCM_DISTRO_LINKS" _TMPDIR = "TMPDIR" _DEFAULT_ROCM_TOOLKIT_PATH = "/opt/rocm" @@ -553,23 +557,36 @@ def _remove_root_dir(path, root_dir): return path[len(root_dir) + 1:] return path +def _setup_rocm_distro_dir_impl(repository_ctx, rocm_distro): + repository_ctx.file("rocm/.index") + for pkg in rocm_distro.packages: + _download_package(repository_ctx, pkg) + + for entry in rocm_distro.required_softlinks: + repository_ctx.symlink( + "{}/{}".format(_DISTRIBUTION_PATH, entry.target), + "{}/{}".format(_DISTRIBUTION_PATH, entry.link), + ) + bash_bin = get_bash_bin(repository_ctx) + return _get_rocm_config(repository_ctx, bash_bin, _canonical_path("{}/{}".format(_DISTRIBUTION_PATH, rocm_distro.rocm_root)), "") + def _setup_rocm_distro_dir(repository_ctx): """Sets up the rocm hermetic installation directory to be used in hermetic build""" bash_bin = get_bash_bin(repository_ctx) + rocm_distro_url 
= repository_ctx.os.environ.get(_ROCM_DISTRO_URL) + if rocm_distro_url: + rocm_distro_hash = repository_ctx.os.environ.get(_ROCM_DISTRO_HASH) + if not rocm_distro_hash: + fail("{} environment variable is required".format(_ROCM_DISTRO_HASH)) + rocm_distro_links = repository_ctx.os.environ.get(_ROCM_DISTRO_LINKS, "") + rocm_distro = create_rocm_distro(rocm_distro_url, rocm_distro_hash, rocm_distro_links) + return _setup_rocm_distro_dir_impl(repository_ctx, rocm_distro) + rocm_distro = repository_ctx.os.environ.get(_ROCM_DISTRO_VERSION) multiple_paths = repository_ctx.os.environ.get(_TF_ROCM_MULTIPLE_PATHS) if rocm_distro: redist = rocm_redist[rocm_distro] - repository_ctx.file("rocm/.index") - for pkg in redist.packages: - _download_package(repository_ctx, pkg) - - for entry in redist.required_softlinks: - repository_ctx.symlink( - "{}/{}".format(_DISTRIBUTION_PATH, entry.target), - "{}/{}".format(_DISTRIBUTION_PATH, entry.link), - ) - return _get_rocm_config(repository_ctx, bash_bin, _canonical_path("{}/{}".format(_DISTRIBUTION_PATH, redist.rocm_root)), "") + return _setup_rocm_distro_dir_impl(repository_ctx, redist) elif multiple_paths: paths_list = multiple_paths.split(":") for rocm_custom_path in paths_list: @@ -866,6 +883,9 @@ _ENVIRONS = [ _ROCM_TOOLKIT_PATH, _TF_ROCM_AMDGPU_TARGETS, _ROCM_DISTRO_VERSION, + _ROCM_DISTRO_URL, + _ROCM_DISTRO_HASH, + _ROCM_DISTRO_LINKS, _TF_ROCM_RBE_DOCKER_IMAGE, _TF_ROCM_RBE_SINGLE_GPU_POOL, _TF_ROCM_RBE_MULTI_GPU_POOL, diff --git a/third_party/xla/third_party/gpus/sycl/build_defs.bzl.tpl b/third_party/xla/third_party/gpus/sycl/build_defs.bzl.tpl index 8b4324dcc8c9da..debfd5d27639f7 100644 --- a/third_party/xla/third_party/gpus/sycl/build_defs.bzl.tpl +++ b/third_party/xla/third_party/gpus/sycl/build_defs.bzl.tpl @@ -1,3 +1,5 @@ +load("@rules_cc//cc:cc_library.bzl", "cc_library") + # Macros for building SYCL code. def if_sycl(if_true, if_false = []): """Shorthand for select()'ing on whether we're building with SYCL.
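Taken together, the rocm_configure.bzl changes above add a second way to obtain a hermetic ROCm tree: besides selecting a predefined rocm_redist entry through ROCM_DISTRO_VERSION, the repository rule can now be pointed at an arbitrary archive through ROCM_DISTRO_URL and ROCM_DISTRO_HASH (the hash is mandatory whenever the URL is set), while ROCM_DISTRO_LINKS optionally carries comma-separated "target:link" pairs that become symlinks under rocm/rocm_dist. All three variables are registered in _ENVIRONS, so they would typically reach the rule through the repository environment, e.g. --repo_env=ROCM_DISTRO_URL=... The plain-Python sketch below only illustrates that parsing and the required-hash check; the helper names, the empty-pair guard, and the sample link spec are invented here and are not part of the Starlark above.

import os
from collections import namedtuple

Softlink = namedtuple("Softlink", ["target", "link"])

def parse_links(distro_links):
    # "a:b,c:d" -> [Softlink(target="a", link="b"), Softlink(target="c", link="d")]
    result = []
    for pair in distro_links.split(","):
        if not pair:  # guard added in this sketch so an unset/empty variable yields no links
            continue
        parts = pair.split(":")
        result.append(Softlink(target=parts[0], link=parts[1]))
    return result

def distro_from_env():
    url = os.environ.get("ROCM_DISTRO_URL")      # archive containing a ROCm installation
    sha256 = os.environ.get("ROCM_DISTRO_HASH")  # required whenever ROCM_DISTRO_URL is set
    links = os.environ.get("ROCM_DISTRO_LINKS", "")
    if url and not sha256:
        raise ValueError("ROCM_DISTRO_HASH environment variable is required")
    return {
        "packages": [{"url": url, "sha256": sha256}],
        "required_softlinks": parse_links(links),
        "rocm_root": "",
    }

print(parse_links("llvm/bin:bin,lib:lib64"))  # hypothetical link spec for illustration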
@@ -40,7 +42,7 @@ def if_sycl_build_is_configured(x, y): def sycl_library(copts = [], linkopts = [], tags = [], deps = [], **kwargs): """Wrapper over cc_library which adds default SYCL options.""" - native.cc_library(copts = sycl_default_copts() + copts, + cc_library(copts = sycl_default_copts() + copts, linkopts = sycl_default_linkopts() + linkopts, tags = tags + ["gpu"], deps = deps + if_sycl_is_configured([ diff --git a/third_party/xla/third_party/highwayhash/highwayhash.BUILD b/third_party/xla/third_party/highwayhash/highwayhash.BUILD index 0314bd443f2617..2c409c8eb8597e 100644 --- a/third_party/xla/third_party/highwayhash/highwayhash.BUILD +++ b/third_party/xla/third_party/highwayhash/highwayhash.BUILD @@ -255,6 +255,7 @@ cc_library( deps = [ ":arch_specific", ":compiler_specific", + ":endianess", ":hh_types", ":iaca", ":load3", diff --git a/third_party/xla/third_party/implib_so/BUILD.bazel b/third_party/xla/third_party/implib_so/BUILD.bazel index ca6976cd8d3425..1cb7282ea89d71 100644 --- a/third_party/xla/third_party/implib_so/BUILD.bazel +++ b/third_party/xla/third_party/implib_so/BUILD.bazel @@ -1,3 +1,5 @@ +load("@rules_python//python:defs.bzl", "py_binary") + package(default_visibility = ["//visibility:public"]) licenses(["notice"]) # MIT diff --git a/third_party/xla/third_party/llvm/generated.patch b/third_party/xla/third_party/llvm/generated.patch index 509398da979e83..f82404ca1cbe14 100644 --- a/third_party/xla/third_party/llvm/generated.patch +++ b/third_party/xla/third_party/llvm/generated.patch @@ -1 +1,152 @@ Auto generated patch. Do not edit or delete it, even if empty. +diff -ruN --strip-trailing-cr a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp +--- a/clang/lib/Serialization/ASTReaderDecl.cpp ++++ b/clang/lib/Serialization/ASTReaderDecl.cpp +@@ -2107,8 +2107,9 @@ + auto *Def = DD.Definition; + DD = std::move(MergeDD); + DD.Definition = Def; +- for (auto *D : Def->redecls()) +- cast<CXXRecordDecl>(D)->DefinitionData = &DD; ++ for (auto *R = Reader.getMostRecentExistingDecl(Def); R; ++ R = R->getPreviousDecl()) ++ cast<CXXRecordDecl>(R)->DefinitionData = &DD; + return; + } + +diff -ruN --strip-trailing-cr a/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h b/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h +--- a/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h ++++ b/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h +@@ -61,14 +61,14 @@ + LIBC_INLINE static void write_mxcsr(uint32_t w) { _mm_setcsr(w); } + + LIBC_INLINE static void clear_except(uint16_t excepts) { +- uint32_t mxcsr = _MM_GET_EXCEPTION_STATE(); ++ uint32_t mxcsr = get_mxcsr(); + mxcsr &= ~static_cast<uint32_t>(excepts); +- _MM_SET_EXCEPTION_STATE(mxcsr); ++ write_mxcsr(mxcsr); + } + + LIBC_INLINE static uint16_t test_except(uint16_t excepts) { + uint32_t mxcsr = get_mxcsr(); +- return static_cast<uint16_t>(excepts & mxcsr); ++ return static_cast<uint16_t>(excepts & ExceptionFlags::ALL_F & mxcsr); + } + + LIBC_INLINE static uint16_t get_except() { +@@ -83,9 +83,9 @@ + } + + LIBC_INLINE static void raise_except(uint16_t excepts) { +- uint32_t mxcsr = _MM_GET_EXCEPTION_STATE(); +- mxcsr |= excepts; +- _MM_SET_EXCEPTION_STATE(mxcsr); ++ uint32_t mxcsr = get_mxcsr(); ++ mxcsr |= excepts & ExceptionFlags::ALL_F; ++ write_mxcsr(mxcsr); + #ifdef LIBC_TRAP_ON_RAISE_FP_EXCEPT + // We will try to trigger the SIGFPE if floating point exceptions are not + // masked.
Since we already set all the floating point exception flags, we +diff -ruN --strip-trailing-cr a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h +--- a/libcxx/include/__flat_map/flat_map.h ++++ b/libcxx/include/__flat_map/flat_map.h +@@ -465,13 +465,13 @@ + } + + // [flat.map.access], element access +- [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](const key_type& __x) ++ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](const key_type& __x) + requires is_constructible_v + { + return try_emplace(__x).first->second; + } + +- [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](key_type&& __x) ++ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](key_type&& __x) + requires is_constructible_v + { + return try_emplace(std::move(__x)).first->second; +@@ -480,7 +480,7 @@ + template + requires(__is_compare_transparent && is_constructible_v && is_constructible_v && + !is_convertible_v<_Kp &&, const_iterator> && !is_convertible_v<_Kp &&, iterator>) +- [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](_Kp&& __x) { ++ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](_Kp&& __x) { + return try_emplace(std::forward<_Kp>(__x)).first->second; + } + +diff -ruN --strip-trailing-cr a/libcxx/include/map b/libcxx/include/map +--- a/libcxx/include/map ++++ b/libcxx/include/map +@@ -1092,9 +1092,9 @@ + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __tree_.size(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return __tree_.max_size(); } + +- [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k); ++ _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k); + # ifndef _LIBCPP_CXX03_LANG +- [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k); ++ _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k); + # endif + + template = 20 + +- [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k); ++ _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k); + # ifndef _LIBCPP_CXX03_LANG +- [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k); ++ _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k); + # endif + + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& at(const key_type& __k); +diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp +--- a/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp ++++ b/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp +@@ -66,9 +66,9 @@ + TransparentKey tkey; + + std::flat_map nfm; +- nfm[key]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +- fm[std::move(key)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +- fm[std::move(tkey)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} ++ nfm[key]; // no-warning ++ fm[std::move(key)]; // no-warning ++ fm[std::move(tkey)]; // no-warning + + fm.at(key); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfm.at(key); // expected-warning {{ignoring return value of function declared with 'nodiscard' 
attribute}} +diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp +--- a/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp ++++ b/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp +@@ -55,8 +55,8 @@ + + int key = 0; + +- m[key]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +- m[std::move(key)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} ++ m[key]; // no-warning ++ m[std::move(key)]; // no-warning + + #if TEST_STD_VER >= 14 + std::map> strMap; +diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp +--- a/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp ++++ b/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp +@@ -81,8 +81,8 @@ + ctm.equal_range(tkey); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + #endif + +- m[key]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +- m[std::move(key)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} ++ m[key]; // no-warning ++ m[std::move(key)]; // no-warning + + m.at(key); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cm.at(key); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} diff --git a/third_party/xla/third_party/llvm/workspace.bzl b/third_party/xla/third_party/llvm/workspace.bzl index 5e3d8f2100a1be..29af0ffbd8c12c 100644 --- a/third_party/xla/third_party/llvm/workspace.bzl +++ b/third_party/xla/third_party/llvm/workspace.bzl @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive") def repo(name): """Imports LLVM.""" - LLVM_COMMIT = "8dee997a8558b460b82b23fb43b197d68258baac" - LLVM_SHA256 = "6a26975000c2cb45787813317bfeeadeafa0cba762e9434fb7940481ec4b27de" + LLVM_COMMIT = "7d381f2a5634d1e41b61299839d652cc4a021898" + LLVM_SHA256 = "f1641918fd3f5e1667d39afb9c261da39ed9f74e30f1c2f98031d6d609a8de15" tf_http_archive( name = name, diff --git a/third_party/xla/third_party/llvm_openmp/BUILD.bazel b/third_party/xla/third_party/llvm_openmp/BUILD.bazel index fbde2733a2a302..15f0218bf2f6a2 100644 --- a/third_party/xla/third_party/llvm_openmp/BUILD.bazel +++ b/third_party/xla/third_party/llvm_openmp/BUILD.bazel @@ -17,6 +17,7 @@ load( "if_macos", "if_windows", ) +load("@rules_python//python:defs.bzl", "py_binary") package( default_visibility = [ diff --git a/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD b/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD index 83cca313adf4f5..3a079c87ab9dd6 100644 --- a/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD +++ b/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD @@ -156,13 +156,5 @@ cc_library( visibility = ["//visibility:public"], deps = [ "@compute_library//:arm_compute", - ] + select({ - # When using MKL-DNN on the AArch64 architecture, OpenMP is required - # for parallelization. Because the Hermetic C++ build environment uses - # the -nodefaultlibs flag, simply passing -fopenmp is insufficient. - # OpenMP's dependencies must be explicitly linked to ensure correct - # inclusion, as automatic linking is disabled. 
- "@rules_ml_toolchain//common:is_hermetic_cc_enabled": ["@rules_ml_toolchain//cc/sysroots:openmp"], - "//conditions:default": [], - }), + ], ) diff --git a/third_party/xla/third_party/nccl/build_defs.bzl.tpl b/third_party/xla/third_party/nccl/build_defs.bzl.tpl index ac7f3bc92cff33..ad447657d907a2 100644 --- a/third_party/xla/third_party/nccl/build_defs.bzl.tpl +++ b/third_party/xla/third_party/nccl/build_defs.bzl.tpl @@ -2,6 +2,7 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures") load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain") +load("@rules_cc//cc:cc_library.bzl", "cc_library") # CUDA toolkit version as tuple (e.g. '(11, 1)'). _cuda_version = %{cuda_version} @@ -311,7 +312,7 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg # Compile host and device code into library. lib = name + "_lib" - native.cc_library( + cc_library( name = lib, hdrs = hdrs, copts = _rdc_copts() + copts, @@ -336,7 +337,7 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg # Compile the source file into a library. dlink = name + "_dlink" - native.cc_library( + cc_library( name = dlink, srcs = [dlink_cc], textual_hdrs = [dlink_hdrs], @@ -371,7 +372,7 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg ) # Create cc target from archive. - native.cc_library( + cc_library( name = name, srcs = [merged], hdrs = hdrs, diff --git a/third_party/xla/third_party/nvtx/BUILD.bazel b/third_party/xla/third_party/nvtx/BUILD.bazel index af6de99cb8fcf7..a8e181e57b1932 100644 --- a/third_party/xla/third_party/nvtx/BUILD.bazel +++ b/third_party/xla/third_party/nvtx/BUILD.bazel @@ -1,5 +1,7 @@ # NVIDIA NVTX 3 +load("@rules_cc//cc:cc_library.bzl", "cc_library") + licenses(["notice"]) exports_files(["LICENSE.txt"]) diff --git a/third_party/xla/third_party/ortools/layering_check.patch b/third_party/xla/third_party/ortools/layering_check.patch new file mode 100644 index 00000000000000..3c2240d8d39e44 --- /dev/null +++ b/third_party/xla/third_party/ortools/layering_check.patch @@ -0,0 +1,4261 @@ +diff --git a/examples/cpp/BUILD.bazel b/examples/cpp/BUILD.bazel +index 6cc1490..a7fa5c1 100644 +--- a/examples/cpp/BUILD.bazel ++++ b/examples/cpp/BUILD.bazel +@@ -711,6 +711,7 @@ cc_test( + cc_library( + name = "print_dimacs_assignment", + hdrs = ["print_dimacs_assignment.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:file", +@@ -725,6 +726,7 @@ cc_library( + cc_library( + name = "parse_dimacs_assignment", + hdrs = ["parse_dimacs_assignment.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/graph:ebert_graph", +@@ -878,6 +880,7 @@ cc_test( + cc_library( + name = "fap_parser", + hdrs = ["fap_parser.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:file", +@@ -891,6 +894,7 @@ cc_library( + cc_library( + name = "fap_model_printer", + hdrs = ["fap_model_printer.h"], ++ features = ["-layering_check"], + deps = [ + ":fap_parser", + "//ortools/base", +@@ -903,6 +907,7 @@ cc_library( + cc_library( + name = "fap_utilities", + hdrs = ["fap_utilities.h"], ++ features = ["-layering_check"], + deps = [ + ":fap_parser", + "//ortools/base", +diff --git a/ortools/algorithms/BUILD.bazel b/ortools/algorithms/BUILD.bazel +index be5f372..4d1c6ae 100644 +--- a/ortools/algorithms/BUILD.bazel ++++ b/ortools/algorithms/BUILD.bazel +@@ -65,6 +65,7 @@ cc_library( + name = "binary_search", + 
srcs = [], + hdrs = ["binary_search.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "@com_google_absl//absl/functional:function_ref", +@@ -95,6 +96,7 @@ cc_library( + name = "radix_sort", + srcs = [], + hdrs = ["radix_sort.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base", +@@ -132,6 +134,7 @@ cc_library( + name = "duplicate_remover", + srcs = ["duplicate_remover.cc"], + hdrs = ["duplicate_remover.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/numeric:bits", +@@ -147,6 +150,7 @@ cc_library( + name = "hungarian", + srcs = ["hungarian.cc"], + hdrs = ["hungarian.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "@com_google_absl//absl/container:flat_hash_map", +@@ -174,6 +178,7 @@ cc_test( + cc_library( + name = "adjustable_k_ary_heap", + hdrs = ["adjustable_k_ary_heap.h"], ++ features = ["-layering_check"], + deps = ["@com_google_absl//absl/log:check"], + ) + +@@ -213,6 +218,7 @@ cc_library( + ":use_scip": ["-DUSE_SCIP"], + "//conditions:default": [], + }), ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", +@@ -269,6 +275,7 @@ cc_library( + name = "set_cover_lagrangian", + srcs = ["set_cover_lagrangian.cc"], + hdrs = ["set_cover_lagrangian.h"], ++ features = ["-layering_check"], + deps = [ + ":adjustable_k_ary_heap", + ":set_cover_invariant", +@@ -282,6 +289,7 @@ cc_library( + name = "set_cover_model", + srcs = ["set_cover_model.cc"], + hdrs = ["set_cover_model.h"], ++ features = ["-layering_check"], + deps = [ + ":set_cover_cc_proto", + "//ortools/base:intops", +@@ -297,6 +305,7 @@ cc_library( + name = "set_cover_invariant", + srcs = ["set_cover_invariant.cc"], + hdrs = ["set_cover_invariant.h"], ++ features = ["-layering_check"], + deps = [ + ":set_cover_cc_proto", + ":set_cover_model", +@@ -311,6 +320,7 @@ cc_library( + name = "set_cover_heuristics", + srcs = ["set_cover_heuristics.cc"], + hdrs = ["set_cover_heuristics.h"], ++ features = ["-layering_check"], + deps = [ + ":adjustable_k_ary_heap", + ":set_cover_invariant", +@@ -328,6 +338,7 @@ cc_library( + name = "set_cover_mip", + srcs = ["set_cover_mip.cc"], + hdrs = ["set_cover_mip.h"], ++ features = ["-layering_check"], + deps = [ + ":set_cover_invariant", + ":set_cover_model", +@@ -343,6 +354,7 @@ cc_library( + name = "set_cover_reader", + srcs = ["set_cover_reader.cc"], + hdrs = ["set_cover_reader.h"], ++ features = ["-layering_check"], + deps = [ + ":set_cover_model", + "//ortools/base:file", +@@ -378,6 +390,7 @@ cc_test( + cc_library( + name = "dense_doubly_linked_list", + hdrs = ["dense_doubly_linked_list.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + ], +@@ -387,6 +400,7 @@ cc_library( + name = "dynamic_partition", + srcs = ["dynamic_partition.cc"], + hdrs = ["dynamic_partition.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:murmur", + "@com_google_absl//absl/log:check", +@@ -411,6 +425,7 @@ cc_library( + name = "sparse_permutation", + srcs = ["sparse_permutation.cc"], + hdrs = ["sparse_permutation.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "@com_google_absl//absl/strings", +@@ -433,6 +448,7 @@ cc_library( + name = "dynamic_permutation", + srcs = ["dynamic_permutation.cc"], + hdrs = ["dynamic_permutation.h"], ++ features = ["-layering_check"], + deps = [ + ":sparse_permutation", + 
"//ortools/base", +@@ -453,6 +469,7 @@ cc_library( + name = "find_graph_symmetries", + srcs = ["find_graph_symmetries.cc"], + hdrs = ["find_graph_symmetries.h"], ++ features = ["-layering_check"], + deps = [ + ":dense_doubly_linked_list", + ":dynamic_partition", +@@ -507,6 +524,7 @@ cc_test( + cc_library( + name = "binary_indexed_tree", + hdrs = ["binary_indexed_tree.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/log:check", + ], +@@ -525,6 +543,7 @@ cc_library( + name = "n_choose_k", + srcs = ["n_choose_k.cc"], + hdrs = ["n_choose_k.h"], ++ features = ["-layering_check"], + deps = [ + ":binary_search", + "//ortools/base:mathutil", +diff --git a/ortools/algorithms/python/BUILD.bazel b/ortools/algorithms/python/BUILD.bazel +index fe3de2c..0a4ccf9 100644 +--- a/ortools/algorithms/python/BUILD.bazel ++++ b/ortools/algorithms/python/BUILD.bazel +@@ -48,6 +48,7 @@ config_setting( + cc_library( + name = "knapsack_solver_doc", + hdrs = ["knapsack_solver_doc.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + ) + +diff --git a/ortools/base/BUILD.bazel b/ortools/base/BUILD.bazel +index c57c0d2..6ebc65a 100644 +--- a/ortools/base/BUILD.bazel ++++ b/ortools/base/BUILD.bazel +@@ -54,6 +54,7 @@ cc_library( + "-DOR_TOOLS_MINOR=11", + "-DOR_TOOLS_PATCH=9999", + ], ++ features = ["-layering_check"], + linkopts = select({ + "on_linux": [], + "on_macos": ["-framework CoreFoundation"], +@@ -83,6 +84,7 @@ cc_library( + cc_library( + name = "accurate_sum", + hdrs = ["accurate_sum.h"], ++ features = ["-layering_check"], + ) + + cc_library( +@@ -91,6 +93,7 @@ cc_library( + "adjustable_priority_queue.h", + "adjustable_priority_queue-inl.h", + ], ++ features = ["-layering_check"], + deps = [ + ":base", + ], +@@ -99,18 +102,21 @@ cc_library( + cc_library( + name = "basictypes", + hdrs = ["basictypes.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "bitmap", + srcs = ["bitmap.cc"], + hdrs = ["bitmap.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "case", + srcs = ["case.cc"], + hdrs = ["case.h"], ++ features = ["-layering_check"], + deps = [":base"], + ) + +@@ -120,6 +126,7 @@ cc_library( + "commandlineflags.cc", + ], + hdrs = ["commandlineflags.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/flags:parse", +@@ -130,6 +137,7 @@ cc_library( + cc_library( + name = "container_logging", + hdrs = ["container_logging.h"], ++ features = ["-layering_check"], + deps = [":base"], + ) + +@@ -142,6 +150,7 @@ cc_library( + "on_windows": ["/Zc:preprocessor"], + "//conditions:default": [], + }), ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/container:inlined_vector", + ], +@@ -167,6 +176,7 @@ cc_test( + cc_library( + name = "dynamic_library", + hdrs = ["dynamic_library.h"], ++ features = ["-layering_check"], + linkopts = select({ + "on_linux": ["-Wl,--no-as-needed -ldl"], + "on_macos": [], +@@ -182,12 +192,14 @@ cc_library( + cc_library( + name = "encodingutils", + hdrs = ["encodingutils.h"], ++ features = ["-layering_check"], + deps = [":base"], + ) + + cc_library( + name = "flags", + hdrs = ["flags.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/flags:flag", + ], +@@ -205,6 +217,7 @@ cc_library( + "helpers.h", + "options.h", + ], ++ features = ["-layering_check"], + deps = [ + ":status_macros", + "@com_google_absl//absl/log", +@@ -218,6 +231,7 @@ cc_library( + cc_library( + name = "status_matchers", + hdrs = 
["status_matchers.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + "@com_google_absl//absl/status", +@@ -230,6 +244,7 @@ cc_library( + cc_library( + name = "message_matchers", + hdrs = ["message_matchers.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", +@@ -240,6 +255,7 @@ cc_library( + cc_library( + name = "gmock", + hdrs = ["gmock.h"], ++ features = ["-layering_check"], + deps = [ + ":message_matchers", + ":status_matchers", +@@ -249,6 +265,7 @@ cc_library( + + cc_library( + name = "gmock_main", ++ features = ["-layering_check"], + deps = [ + ":gmock", + "@com_google_googletest//:gtest_main", +@@ -259,6 +276,7 @@ cc_library( + name = "gzipfile", + srcs = ["gzipfile.cc"], + hdrs = ["gzipfile.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + ":basictypes", +@@ -272,6 +290,7 @@ cc_library( + cc_library( + name = "gzipstring", + hdrs = ["gzipstring.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + "@zlib", +@@ -286,6 +305,7 @@ cc_library( + hdrs = [ + "hash.h", + ], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/strings", + ], +@@ -294,24 +314,28 @@ cc_library( + cc_library( + name = "int_type", + hdrs = ["int_type.h"], ++ features = ["-layering_check"], + deps = [":base"], + ) + + cc_library( + name = "intops", + hdrs = ["strong_int.h"], ++ features = ["-layering_check"], + deps = [":int_type"], + ) + + cc_library( + name = "iterator_adaptors", + hdrs = ["iterator_adaptors.h"], ++ features = ["-layering_check"], + deps = [":base"], + ) + + cc_library( + name = "linked_hash_map", + hdrs = ["linked_hash_map.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + ":logging", +@@ -324,6 +348,7 @@ cc_library( + name = "logging", + srcs = ["logging.cc"], + hdrs = ["logging.h"], ++ features = ["-layering_check"], + deps = [ + ":macros", + "@com_google_absl//absl/base:log_severity", +@@ -344,11 +369,13 @@ cc_library( + cc_library( + name = "macros", + hdrs = ["macros.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "map_util", + hdrs = ["map_util.h"], ++ features = ["-layering_check"], + deps = [":base"], + ) + +@@ -356,6 +383,7 @@ cc_library( + name = "mathutil", + srcs = ["mathutil.cc"], + hdrs = ["mathutil.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + ], +@@ -364,12 +392,14 @@ cc_library( + cc_library( + name = "memfile", + hdrs = ["memfile.h"], ++ features = ["-layering_check"], + deps = [], + ) + + cc_library( + name = "murmur", + hdrs = ["murmur.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + ":hash", +@@ -380,6 +410,7 @@ cc_library( + cc_library( + name = "mutable_memfile", + hdrs = ["mutable_memfile.h"], ++ features = ["-layering_check"], + deps = [], + ) + +@@ -387,6 +418,7 @@ cc_library( + name = "numbers", + srcs = ["numbers.cc"], + hdrs = ["numbers.h"], ++ features = ["-layering_check"], + deps = [ + ":strtoint", + "@com_google_absl//absl/strings", +@@ -396,6 +428,7 @@ cc_library( + cc_library( + name = "parse_text_proto", + hdrs = ["parse_text_proto.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/log:check", + "@com_google_protobuf//:protobuf", +@@ -406,6 +439,7 @@ cc_library( + name = "path", + srcs = ["path.cc"], + hdrs = ["path.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + "@com_google_absl//absl/strings", +@@ -416,6 +450,7 @@ cc_library( + name = "temp_path", + srcs = ["temp_path.cc"], + hdrs = ["temp_path.h"], ++ features = 
["-layering_check"], + deps = [ + ":base", + ":file", +@@ -429,11 +464,13 @@ cc_library( + cc_library( + name = "protobuf_util", + hdrs = ["protobuf_util.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "protoutil", + hdrs = ["protoutil.h"], ++ features = ["-layering_check"], + deps = [ + ":timer", + "@com_google_absl//absl/status", +@@ -445,12 +482,14 @@ cc_library( + cc_library( + name = "ptr_util", + hdrs = ["ptr_util.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "recordio", + srcs = ["recordio.cc"], + hdrs = ["recordio.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + ":file", +@@ -465,18 +504,21 @@ cc_library( + cc_library( + name = "small_map", + hdrs = ["small_map.h"], ++ features = ["-layering_check"], + deps = [":base"], + ) + + cc_library( + name = "source_location", + hdrs = ["source_location.h"], ++ features = ["-layering_check"], + deps = ["@com_google_absl//absl/base:config"], + ) + + cc_library( + name = "status_builder", + hdrs = ["status_builder.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + "@com_google_absl//absl/status", +@@ -487,6 +529,7 @@ cc_library( + cc_library( + name = "status_macros", + hdrs = ["status_macros.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + ":status_builder", +@@ -498,12 +541,14 @@ cc_library( + cc_library( + name = "stl_util", + hdrs = ["stl_util.h"], ++ features = ["-layering_check"], + deps = [":base"], + ) + + cc_library( + name = "strong_vector", + hdrs = ["strong_vector.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + ":intops", +@@ -514,6 +559,7 @@ cc_library( + name = "strtoint", + srcs = ["strtoint.cc"], + hdrs = ["strtoint.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", +@@ -524,6 +570,7 @@ cc_library( + name = "sysinfo", + srcs = ["sysinfo.cc"], + hdrs = ["sysinfo.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/strings", + ], +@@ -533,6 +580,7 @@ cc_library( + name = "threadpool", + srcs = ["threadpool.cc"], + hdrs = ["threadpool.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/synchronization", +@@ -543,6 +591,7 @@ cc_library( + name = "timer", + srcs = ["timer.cc"], + hdrs = ["timer.h"], ++ features = ["-layering_check"], + deps = [ + ":macros", + "@com_google_absl//absl/log:check", +@@ -553,22 +602,26 @@ cc_library( + cc_library( + name = "top_n", + hdrs = ["top_n.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "typeid", + hdrs = ["typeid.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "types", + hdrs = ["types.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "zipfile", + srcs = ["zipfile.cc"], + hdrs = ["zipfile.h"], ++ features = ["-layering_check"], + deps = [ + ":basictypes", + ":file", +diff --git a/ortools/bop/BUILD.bazel b/ortools/bop/BUILD.bazel +index 4720990..605ce2b 100644 +--- a/ortools/bop/BUILD.bazel ++++ b/ortools/bop/BUILD.bazel +@@ -30,6 +30,7 @@ cc_proto_library( + cc_library( + name = "bop_types", + hdrs = ["bop_types.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:intops", +@@ -41,6 +42,7 @@ cc_library( + name = "bop_base", + srcs = ["bop_base.cc"], + hdrs = ["bop_base.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_parameters_cc_proto", + ":bop_solution", +@@ -67,6 +69,7 @@ cc_library( + name = "bop_util", + srcs = 
["bop_util.cc"], + hdrs = ["bop_util.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_base", + ":bop_solution", +@@ -80,6 +83,7 @@ cc_library( + name = "bop_solution", + srcs = ["bop_solution.cc"], + hdrs = ["bop_solution.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_types", + "//ortools/base", +@@ -94,6 +98,7 @@ cc_library( + name = "bop_fs", + srcs = ["bop_fs.cc"], + hdrs = ["bop_fs.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_base", + ":bop_parameters_cc_proto", +@@ -126,6 +131,7 @@ cc_library( + name = "bop_lns", + srcs = ["bop_lns.cc"], + hdrs = ["bop_lns.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_base", + ":bop_parameters_cc_proto", +@@ -156,6 +162,7 @@ cc_library( + name = "complete_optimizer", + srcs = ["complete_optimizer.cc"], + hdrs = ["complete_optimizer.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_base", + ":bop_solution", +@@ -178,6 +185,7 @@ cc_library( + name = "bop_ls", + srcs = ["bop_ls.cc"], + hdrs = ["bop_ls.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_base", + ":bop_solution", +@@ -199,6 +207,7 @@ cc_library( + name = "bop_portfolio", + srcs = ["bop_portfolio.cc"], + hdrs = ["bop_portfolio.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_base", + ":bop_fs", +@@ -231,6 +240,7 @@ cc_library( + name = "bop_solver", + srcs = ["bop_solver.cc"], + hdrs = ["bop_solver.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_base", + ":bop_fs", +@@ -265,6 +275,7 @@ cc_library( + name = "integral_solver", + srcs = ["integral_solver.cc"], + hdrs = ["integral_solver.h"], ++ features = ["-layering_check"], + deps = [ + ":bop_base", + ":bop_fs", +diff --git a/ortools/constraint_solver/BUILD.bazel b/ortools/constraint_solver/BUILD.bazel +index 99d9b4d..6cedaa6 100644 +--- a/ortools/constraint_solver/BUILD.bazel ++++ b/ortools/constraint_solver/BUILD.bazel +@@ -169,6 +169,7 @@ cc_library( + "constraint_solver.h", + "constraint_solveri.h", + ], ++ features = ["-layering_check"], + deps = [ + ":assignment_cc_proto", + ":demon_profiler_cc_proto", +@@ -267,6 +268,7 @@ cc_library( + name = "routing_parameters", + srcs = ["routing_parameters.cc"], + hdrs = ["routing_parameters.h"], ++ features = ["-layering_check"], + deps = [ + ":cp", + ":routing_enums_cc_proto", +@@ -286,6 +288,7 @@ cc_library( + cc_library( + name = "routing_types", + hdrs = ["routing_types.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:intops", +@@ -296,6 +299,7 @@ cc_library( + name = "routing_utils", + srcs = ["routing_utils.cc"], + hdrs = ["routing_utils.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + "//ortools/base", +@@ -307,6 +311,7 @@ cc_library( + name = "routing_neighborhoods", + srcs = ["routing_neighborhoods.cc"], + hdrs = ["routing_neighborhoods.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":cp", +@@ -320,6 +325,7 @@ cc_library( + name = "routing_index_manager", + srcs = ["routing_index_manager.cc"], + hdrs = ["routing_index_manager.h"], ++ features = ["-layering_check"], + deps = [ + ":routing_types", + "//ortools/base", +@@ -360,6 +366,7 @@ cc_library( + "on_windows": ["/Zc:preprocessor"], + "//conditions:default": [], + }), ++ features = ["-layering_check"], + deps = [ + ":cp", + ":routing_enums_cc_proto", +diff --git a/ortools/flatzinc/BUILD.bazel b/ortools/flatzinc/BUILD.bazel +index d3e8b22..5015c77 100644 +--- a/ortools/flatzinc/BUILD.bazel ++++ b/ortools/flatzinc/BUILD.bazel 
+@@ -46,6 +46,7 @@ cc_library( + name = "model", + srcs = ["model.cc"], + hdrs = ["model.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:hash", +@@ -71,6 +72,7 @@ cc_library( + copts = [ + "$(STACK_FRAME_UNLIMITED)", # parser.tab.cc + ], ++ features = ["-layering_check"], + deps = [ + ":model", + "//ortools/base", +@@ -90,6 +92,7 @@ cc_library( + "on_windows": [], + "//conditions:default": [], + }), ++ features = ["-layering_check"], + deps = [ + ":parser_yacc_lib", + "//ortools/base", +@@ -102,6 +105,7 @@ cc_library( + name = "parser_lib", + srcs = ["parser.cc"], + hdrs = ["parser.h"], ++ features = ["-layering_check"], + deps = [ + ":model", + ":parser_lex_lib", +@@ -113,6 +117,7 @@ cc_library( + name = "presolve", + srcs = ["presolve.cc"], + hdrs = ["presolve.h"], ++ features = ["-layering_check"], + deps = [ + ":model", + "//ortools/base", +@@ -128,6 +133,7 @@ cc_library( + name = "checker", + srcs = ["checker.cc"], + hdrs = ["checker.h"], ++ features = ["-layering_check"], + deps = [ + ":model", + "//ortools/base", +@@ -142,6 +148,7 @@ cc_library( + name = "cp_model_fz_solver", + srcs = ["cp_model_fz_solver.cc"], + hdrs = ["cp_model_fz_solver.h"], ++ features = ["-layering_check"], + deps = [ + ":checker", + ":model", +diff --git a/ortools/glop/BUILD.bazel b/ortools/glop/BUILD.bazel +index 687c48d..48e856c 100644 +--- a/ortools/glop/BUILD.bazel ++++ b/ortools/glop/BUILD.bazel +@@ -54,6 +54,7 @@ SAFE_FP_CODE = select({ + cc_library( + name = "pricing", + hdrs = ["pricing.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/lp_data:base", +@@ -69,6 +70,7 @@ cc_library( + srcs = ["revised_simplex.cc"], + hdrs = ["revised_simplex.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":basis_representation", + ":dual_edge_norms", +@@ -106,6 +108,7 @@ cc_library( + srcs = ["update_row.cc"], + hdrs = ["update_row.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":basis_representation", + ":parameters_cc_proto", +@@ -125,6 +128,7 @@ cc_library( + srcs = ["variables_info.cc"], + hdrs = ["variables_info.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/lp_data:base", +@@ -139,6 +143,7 @@ cc_library( + srcs = ["lu_factorization.cc"], + hdrs = ["lu_factorization.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":markowitz", + ":parameters_cc_proto", +@@ -155,6 +160,7 @@ cc_library( + srcs = ["markowitz.cc"], + hdrs = ["markowitz.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":parameters_cc_proto", + ":status", +@@ -174,6 +180,7 @@ cc_library( + srcs = ["basis_representation.cc"], + hdrs = ["basis_representation.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":lu_factorization", + ":parameters_cc_proto", +@@ -193,6 +200,7 @@ cc_library( + name = "rank_one_update", + hdrs = ["rank_one_update.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":status", + "//ortools/base", +@@ -210,6 +218,7 @@ cc_library( + srcs = ["initial_basis.cc"], + hdrs = ["initial_basis.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":markowitz", + "//ortools/base", +@@ -227,6 +236,7 @@ cc_library( + srcs = ["status.cc"], + hdrs = ["status.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + ], +@@ -255,6 +265,7 @@ cc_library( + srcs = ["dual_edge_norms.cc"], + 
hdrs = ["dual_edge_norms.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":basis_representation", + ":parameters_cc_proto", +@@ -274,6 +285,7 @@ cc_library( + srcs = ["primal_edge_norms.cc"], + hdrs = ["primal_edge_norms.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":basis_representation", + ":parameters_cc_proto", +@@ -293,6 +305,7 @@ cc_library( + srcs = ["reduced_costs.cc"], + hdrs = ["reduced_costs.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":basis_representation", + ":parameters_cc_proto", +@@ -317,6 +330,7 @@ cc_library( + srcs = ["variable_values.cc"], + hdrs = ["variable_values.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":basis_representation", + ":dual_edge_norms", +@@ -338,6 +352,7 @@ cc_library( + srcs = ["entering_variable.cc"], + hdrs = ["entering_variable.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":basis_representation", + ":parameters_cc_proto", +@@ -366,6 +381,7 @@ cc_library( + srcs = ["preprocessor.cc"], + hdrs = ["preprocessor.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":parameters_cc_proto", + ":revised_simplex", +@@ -389,6 +405,7 @@ cc_library( + srcs = ["lp_solver.cc"], + hdrs = ["lp_solver.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":parameters_cc_proto", + ":preprocessor", +@@ -413,6 +430,7 @@ cc_library( + name = "parameters_validation", + srcs = ["parameters_validation.cc"], + hdrs = ["parameters_validation.h"], ++ features = ["-layering_check"], + deps = [ + ":parameters_cc_proto", + "@com_google_absl//absl/strings", +diff --git a/ortools/glpk/BUILD.bazel b/ortools/glpk/BUILD.bazel +index 246ee67..7f2c088 100644 +--- a/ortools/glpk/BUILD.bazel ++++ b/ortools/glpk/BUILD.bazel +@@ -18,6 +18,7 @@ cc_library( + name = "glpk_env_deleter", + srcs = ["glpk_env_deleter.cc"], + hdrs = ["glpk_env_deleter.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "@glpk", +@@ -28,6 +29,7 @@ cc_library( + name = "glpk_formatters", + srcs = ["glpk_formatters.cc"], + hdrs = ["glpk_formatters.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "@com_google_absl//absl/strings", +@@ -38,6 +40,7 @@ cc_library( + cc_library( + name = "glpk_computational_form", + hdrs = ["glpk_computational_form.h"], ++ features = ["-layering_check"], + deps = [ + "@glpk", + ], +diff --git a/ortools/graph/BUILD.bazel b/ortools/graph/BUILD.bazel +index fe0f588..4bb9556 100644 +--- a/ortools/graph/BUILD.bazel ++++ b/ortools/graph/BUILD.bazel +@@ -35,6 +35,7 @@ config_setting( + cc_library( + name = "graphs", + hdrs = ["graphs.h"], ++ features = ["-layering_check"], + deps = [ + ":ebert_graph", + ":graph", +@@ -44,6 +45,7 @@ cc_library( + cc_library( + name = "graph", + hdrs = ["graph.h"], ++ features = ["-layering_check"], + deps = [ + ":iterators", + "//ortools/base", +@@ -55,6 +57,7 @@ cc_library( + cc_library( + name = "bfs", + hdrs = ["bfs.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings:str_format", +@@ -64,6 +67,7 @@ cc_library( + cc_library( + name = "bounded_dijkstra", + hdrs = ["bounded_dijkstra.h"], ++ features = ["-layering_check"], + deps = [ + ":graph", + "//ortools/base:iterator_adaptors", +@@ -78,6 +82,7 @@ cc_library( + cc_library( + name = "multi_dijkstra", + hdrs = ["multi_dijkstra.h"], ++ features = ["-layering_check"], + deps = [ + 
"//ortools/base:map_util", + "//ortools/base:types", +@@ -88,6 +93,7 @@ cc_library( + cc_library( + name = "bidirectional_dijkstra", + hdrs = ["bidirectional_dijkstra.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:iterator_adaptors", +@@ -103,6 +109,7 @@ cc_library( + name = "cliques", + srcs = ["cliques.cc"], + hdrs = ["cliques.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:int_type", +@@ -116,6 +123,7 @@ cc_library( + cc_library( + name = "hamiltonian_path", + hdrs = ["hamiltonian_path.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:types", +@@ -129,6 +137,7 @@ cc_library( + cc_library( + name = "christofides", + hdrs = ["christofides.h"], ++ features = ["-layering_check"], + deps = [ + ":eulerian_path", + ":graph", +@@ -147,6 +156,7 @@ cc_library( + cc_library( + name = "eulerian_path", + hdrs = ["eulerian_path.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + ], +@@ -155,6 +165,7 @@ cc_library( + cc_library( + name = "minimum_spanning_tree", + hdrs = ["minimum_spanning_tree.h"], ++ features = ["-layering_check"], + deps = [ + ":connected_components", + "//ortools/base:adjustable_priority_queue", +@@ -167,6 +178,7 @@ cc_library( + cc_library( + name = "one_tree_lower_bound", + hdrs = ["one_tree_lower_bound.h"], ++ features = ["-layering_check"], + deps = [ + ":christofides", + ":graph", +@@ -179,6 +191,7 @@ cc_library( + cc_library( + name = "ebert_graph", + hdrs = ["ebert_graph.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:types", +@@ -192,6 +205,7 @@ cc_library( + name = "shortest_paths", + srcs = ["shortest_paths.cc"], + hdrs = ["shortest_paths.h"], ++ features = ["-layering_check"], + deps = [ + ":ebert_graph", + ":graph", +@@ -212,6 +226,7 @@ cc_library( + cc_library( + name = "k_shortest_paths", + hdrs = ["k_shortest_paths.h"], ++ features = ["-layering_check"], + deps = [ + ":bounded_dijkstra", + ":ebert_graph", +@@ -242,6 +257,7 @@ cc_library( + name = "max_flow", + srcs = ["max_flow.cc"], + hdrs = ["max_flow.h"], ++ features = ["-layering_check"], + deps = [ + ":ebert_graph", + ":flow_problem_cc_proto", +@@ -290,6 +306,7 @@ cc_library( + "on_windows": ["/Zc:preprocessor"], + "//conditions:default": [], + }), ++ features = ["-layering_check"], + deps = [ + ":ebert_graph", + ":graph", +@@ -336,6 +353,7 @@ cc_library( + name = "assignment", + srcs = ["assignment.cc"], + hdrs = ["assignment.h"], ++ features = ["-layering_check"], + deps = [ + ":ebert_graph", + ":linear_assignment", +@@ -349,6 +367,7 @@ cc_library( + name = "linear_assignment", + srcs = ["linear_assignment.cc"], + hdrs = ["linear_assignment.h"], ++ features = ["-layering_check"], + deps = [ + ":ebert_graph", + "//ortools/base", +@@ -364,6 +383,7 @@ cc_library( + name = "perfect_matching", + srcs = ["perfect_matching.cc"], + hdrs = ["perfect_matching.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:adjustable_priority_queue", +@@ -382,6 +402,7 @@ cc_library( + name = "dag_shortest_path", + srcs = ["dag_shortest_path.cc"], + hdrs = ["dag_shortest_path.h"], ++ features = ["-layering_check"], + deps = [ + ":ebert_graph", + ":graph", +@@ -399,6 +420,7 @@ cc_library( + name = "dag_constrained_shortest_path", + srcs = ["dag_constrained_shortest_path.cc"], + hdrs = ["dag_constrained_shortest_path.h"], ++ features = ["-layering_check"], + deps = [ + ":dag_shortest_path", + ":graph", +@@ -416,6 
+438,7 @@ cc_library( + cc_library( + name = "rooted_tree", + hdrs = ["rooted_tree.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "@com_google_absl//absl/algorithm:container", +@@ -437,6 +460,7 @@ cc_library( + hdrs = [ + "connected_components.h", + ], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:map_util", +@@ -450,6 +474,7 @@ cc_library( + cc_library( + name = "io", + hdrs = ["io.h"], ++ features = ["-layering_check"], + deps = [ + ":graph", + "//ortools/base:numbers", +@@ -463,12 +488,14 @@ cc_library( + cc_library( + name = "iterators", + hdrs = ["iterators.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "random_graph", + srcs = ["random_graph.cc"], + hdrs = ["random_graph.h"], ++ features = ["-layering_check"], + deps = [ + ":graph", + "//ortools/base:logging", +@@ -485,6 +512,7 @@ cc_library( + hdrs = [ + "strongly_connected_components.h", + ], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + ], +@@ -494,6 +522,7 @@ cc_library( + name = "topologicalsorter", + srcs = ["topologicalsorter.cc"], + hdrs = ["topologicalsorter.h"], ++ features = ["-layering_check"], + deps = [ + ":graph", + "//ortools/base", +@@ -512,6 +541,7 @@ cc_library( + name = "util", + srcs = ["util.cc"], + hdrs = ["util.h"], ++ features = ["-layering_check"], + deps = [ + ":connected_components", + ":graph", +diff --git a/ortools/gscip/BUILD.bazel b/ortools/gscip/BUILD.bazel +index d949483..37dd2ee 100644 +--- a/ortools/gscip/BUILD.bazel ++++ b/ortools/gscip/BUILD.bazel +@@ -39,6 +39,7 @@ cc_library( + name = "gscip_parameters", + srcs = ["gscip_parameters.cc"], + hdrs = ["gscip_parameters.h"], ++ features = ["-layering_check"], + deps = [ + ":gscip_cc_proto", + "//ortools/base:status_macros", +@@ -62,6 +63,7 @@ cc_library( + name = "legacy_scip_params", + srcs = ["legacy_scip_params.cc"], + hdrs = ["legacy_scip_params.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/linear_solver:scip_helper_macros", + "//ortools/linear_solver:scip_with_glop", +@@ -81,6 +83,7 @@ cc_library( + "gscip.h", + "gscip_event_handler.h", + ], ++ features = ["-layering_check"], + deps = [ + ":gscip_cc_proto", + ":gscip_message_handler", +@@ -106,6 +109,7 @@ cc_library( + name = "gscip_ext", + srcs = ["gscip_ext.cc"], + hdrs = ["gscip_ext.h"], ++ features = ["-layering_check"], + deps = [ + ":gscip", + "//ortools/base:status_macros", +@@ -118,6 +122,7 @@ cc_library( + name = "gscip_message_handler", + srcs = ["gscip_message_handler.cc"], + hdrs = ["gscip_message_handler.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/linear_solver:scip_helper_macros", +@@ -131,6 +136,7 @@ cc_library( + name = "gscip_callback_result", + srcs = ["gscip_callback_result.cc"], + hdrs = ["gscip_callback_result.h"], ++ features = ["-layering_check"], + deps = ["@scip//:libscip"], + ) + +@@ -138,6 +144,7 @@ cc_library( + name = "gscip_constraint_handler", + srcs = ["gscip_constraint_handler.cc"], + hdrs = ["gscip_constraint_handler.h"], ++ features = ["-layering_check"], + deps = [ + ":gscip", + ":gscip_callback_result", +diff --git a/ortools/gurobi/BUILD.bazel b/ortools/gurobi/BUILD.bazel +index d8e4a72..83da625 100644 +--- a/ortools/gurobi/BUILD.bazel ++++ b/ortools/gurobi/BUILD.bazel +@@ -21,6 +21,7 @@ cc_library( + hdrs = [ + "environment.h", + ], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:dynamic_library", +@@ -39,6 +40,7 @@ cc_library( + name = 
"gurobi_util", + srcs = ["gurobi_util.cc"], + hdrs = ["gurobi_util.h"], ++ features = ["-layering_check"], + deps = [ + ":environment", + "@com_google_absl//absl/strings", +@@ -51,5 +53,6 @@ cc_library( + testonly = True, + srcs = ["gurobi_stdout_matchers.cc"], + hdrs = ["gurobi_stdout_matchers.h"], ++ features = ["-layering_check"], + deps = ["//ortools/base:gmock"], + ) +diff --git a/ortools/gurobi/isv_public/BUILD.bazel b/ortools/gurobi/isv_public/BUILD.bazel +index efae616..1006da8 100644 +--- a/ortools/gurobi/isv_public/BUILD.bazel ++++ b/ortools/gurobi/isv_public/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "gurobi_isv", + srcs = ["gurobi_isv.cc"], + hdrs = ["gurobi_isv.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/gurobi:environment", + "//ortools/math_opt/solvers:gurobi_cc_proto", +diff --git a/ortools/init/BUILD.bazel b/ortools/init/BUILD.bazel +index 0705399..aec2da3 100644 +--- a/ortools/init/BUILD.bazel ++++ b/ortools/init/BUILD.bazel +@@ -16,6 +16,7 @@ package(default_visibility = ["//visibility:public"]) + cc_library( + name = "init", + hdrs = ["init.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/gurobi:environment", +diff --git a/ortools/init/python/BUILD.bazel b/ortools/init/python/BUILD.bazel +index 1774f36..eb75897 100644 +--- a/ortools/init/python/BUILD.bazel ++++ b/ortools/init/python/BUILD.bazel +@@ -21,6 +21,7 @@ load("@rules_python//python:defs.bzl", "py_test") + cc_library( + name = "init_doc", + hdrs = ["init_doc.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + ) + +diff --git a/ortools/linear_solver/BUILD.bazel b/ortools/linear_solver/BUILD.bazel +index 618e192..b7bcf34 100644 +--- a/ortools/linear_solver/BUILD.bazel ++++ b/ortools/linear_solver/BUILD.bazel +@@ -252,6 +252,7 @@ cc_library( + ":use_cplex": ["-DUSE_CPLEX"], + "//conditions:default": [], + }), ++ features = ["-layering_check"], + deps = [ + ":linear_solver_cc_proto", + ":model_exporter", +@@ -323,6 +324,7 @@ cc_library( + name = "model_validator", + srcs = ["model_validator.cc"], + hdrs = ["model_validator.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":linear_solver_cc_proto", +@@ -352,6 +354,7 @@ copy_file( + cc_library( + name = "scip_with_glop", + srcs = ["lpi_glop.cpp"], ++ features = ["-layering_check"], + deps = [ + "//ortools/glop:lp_solver", + "@scip//:libscip", +@@ -361,6 +364,7 @@ cc_library( + cc_library( + name = "scip_helper_macros", + hdrs = ["scip_helper_macros.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "@com_google_absl//absl/status", +@@ -373,6 +377,7 @@ cc_library( + name = "model_exporter", + srcs = ["model_exporter.cc"], + hdrs = ["model_exporter.h"], ++ features = ["-layering_check"], + deps = [ + ":linear_solver_cc_proto", + "//ortools/base", +@@ -412,6 +417,7 @@ cc_library( + name = "solve_mp_model", + srcs = ["solve_mp_model.cc"], + hdrs = ["solve_mp_model.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":linear_solver", +diff --git a/ortools/linear_solver/proto_solver/BUILD.bazel b/ortools/linear_solver/proto_solver/BUILD.bazel +index 57a1d82..3998779 100644 +--- a/ortools/linear_solver/proto_solver/BUILD.bazel ++++ b/ortools/linear_solver/proto_solver/BUILD.bazel +@@ -16,6 +16,7 @@ package(default_visibility = ["//visibility:public"]) + cc_library( + name = "proto_utils", + hdrs = ["proto_utils.h"], ++ features = ["-layering_check"], + visibility 
= ["//visibility:public"], + deps = [ + "//ortools/port:proto_utils", +@@ -28,6 +29,7 @@ cc_library( + name = "glop_proto_solver", + srcs = ["glop_proto_solver.cc"], + hdrs = ["glop_proto_solver.h"], ++ features = ["-layering_check"], + deps = [ + ":proto_utils", + "//ortools/glop:lp_solver", +@@ -52,6 +54,7 @@ cc_library( + name = "pdlp_proto_solver", + srcs = ["pdlp_proto_solver.cc"], + hdrs = ["pdlp_proto_solver.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:logging", + "//ortools/linear_solver:linear_solver_cc_proto", +@@ -71,6 +74,7 @@ cc_library( + name = "sat_solver_utils", + srcs = ["sat_solver_utils.cc"], + hdrs = ["sat_solver_utils.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/glop:parameters_cc_proto", + "//ortools/glop:preprocessor", +@@ -85,6 +89,7 @@ cc_library( + name = "sat_proto_solver", + srcs = ["sat_proto_solver.cc"], + hdrs = ["sat_proto_solver.h"], ++ features = ["-layering_check"], + deps = [ + ":proto_utils", + ":sat_solver_utils", +@@ -118,6 +123,7 @@ cc_library( + "//ortools/linear_solver:use_scip": ["USE_SCIP"], + "//conditions:default": [], + }), ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:timer", +@@ -144,6 +150,7 @@ cc_library( + name = "gurobi_proto_solver", + srcs = ["gurobi_proto_solver.cc"], + hdrs = ["gurobi_proto_solver.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:timer", + "//ortools/gurobi:environment", +@@ -171,6 +178,7 @@ cc_library( + "//ortools/linear_solver:use_highs": ["USE_HIGHS"], + "//conditions:default": [], + }), ++ features = ["-layering_check"], + deps = [ + "//ortools/base:timer", + "//ortools/linear_solver:linear_solver_cc_proto", +@@ -186,6 +194,7 @@ cc_library( + name = "xpress_proto_solver", + srcs = ["xpress_proto_solver.cc"], + hdrs = ["xpress_proto_solver.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:timer", + "//ortools/linear_solver:linear_solver_cc_proto", +diff --git a/ortools/linear_solver/wrappers/BUILD.bazel b/ortools/linear_solver/wrappers/BUILD.bazel +index f0f031b..fce5554 100644 +--- a/ortools/linear_solver/wrappers/BUILD.bazel ++++ b/ortools/linear_solver/wrappers/BUILD.bazel +@@ -35,6 +35,7 @@ cc_library( + "-DUSE_SCIP", + "-DUSE_LP_PARSER", + ], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + "//ortools/base:file", +diff --git a/ortools/lp_data/BUILD.bazel b/ortools/lp_data/BUILD.bazel +index c0e2993..b8bbb47 100644 +--- a/ortools/lp_data/BUILD.bazel ++++ b/ortools/lp_data/BUILD.bazel +@@ -48,6 +48,7 @@ cc_library( + name = "base", + srcs = ["lp_types.cc"], + hdrs = ["lp_types.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:hash", +@@ -61,6 +62,7 @@ cc_library( + name = "permutation", + hdrs = ["permutation.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + "//ortools/base", +@@ -73,6 +75,7 @@ cc_library( + cc_library( + name = "scattered_vector", + hdrs = ["scattered_vector.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + "//ortools/base", +@@ -86,6 +89,7 @@ cc_library( + name = "sparse_vector", + hdrs = ["sparse_vector.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":permutation", +@@ -102,6 +106,7 @@ cc_library( + srcs = ["sparse_column.cc"], + hdrs = ["sparse_column.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":sparse_vector", +@@ -113,6 +118,7 @@ cc_library( + name = 
"sparse_row", + hdrs = ["sparse_row.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":sparse_vector", +@@ -127,6 +133,7 @@ cc_library( + "sparse.h", + ], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":matrix_scaler_hdr", +@@ -148,6 +155,7 @@ cc_library( + srcs = ["matrix_scaler.cc"], + hdrs = ["matrix_scaler.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":lp_utils", +@@ -165,6 +173,7 @@ cc_library( + cc_library( + name = "matrix_scaler_hdr", + hdrs = ["matrix_scaler.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + "//ortools/base", +@@ -177,6 +186,7 @@ cc_library( + srcs = ["lp_data.cc"], + hdrs = ["lp_data.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":lp_print_utils", +@@ -200,6 +210,7 @@ cc_library( + name = "lp_data_utils", + srcs = ["lp_data_utils.cc"], + hdrs = ["lp_data_utils.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + ":lp_data", +@@ -213,6 +224,7 @@ cc_library( + srcs = ["lp_utils.cc"], + hdrs = ["lp_utils.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":scattered_vector", +@@ -227,6 +239,7 @@ cc_library( + srcs = ["matrix_utils.cc"], + hdrs = ["matrix_utils.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":sparse", +@@ -241,6 +254,7 @@ cc_library( + hdrs = ["lp_parser.h"], + copts = SAFE_FP_CODE, + defines = ["USE_LP_PARSER"], ++ features = ["-layering_check"], + deps = [ + ":base", + ":lp_data", +@@ -271,6 +285,7 @@ cc_library( + srcs = ["lp_print_utils.cc"], + hdrs = ["lp_print_utils.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + "//ortools/base", +@@ -285,6 +300,7 @@ cc_library( + srcs = ["proto_utils.cc"], + hdrs = ["proto_utils.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":lp_data", +@@ -297,6 +313,7 @@ cc_library( + name = "mps_reader_template", + srcs = ["mps_reader_template.cc"], + hdrs = ["mps_reader_template.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:map_util", +@@ -317,6 +334,7 @@ cc_library( + srcs = ["mps_reader.cc"], + hdrs = ["mps_reader.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":lp_data", + ":lp_print_utils", +@@ -337,6 +355,7 @@ cc_library( + name = "model_reader", + srcs = ["model_reader.cc"], + hdrs = ["model_reader.h"], ++ features = ["-layering_check"], + deps = [ + ":lp_data", + ":mps_reader", +@@ -354,6 +373,7 @@ cc_library( + srcs = ["lp_decomposer.cc"], + hdrs = ["lp_decomposer.h"], + copts = SAFE_FP_CODE, ++ features = ["-layering_check"], + deps = [ + ":base", + ":lp_data", +@@ -370,6 +390,7 @@ cc_library( + name = "sol_reader", + srcs = ["sol_reader.cc"], + hdrs = ["sol_reader.h"], ++ features = ["-layering_check"], + deps = [ + ":base", + ":lp_data", +diff --git a/ortools/math_opt/constraints/indicator/BUILD.bazel b/ortools/math_opt/constraints/indicator/BUILD.bazel +index 12fdf6d..e4d2fa4 100644 +--- a/ortools/math_opt/constraints/indicator/BUILD.bazel ++++ b/ortools/math_opt/constraints/indicator/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "indicator_constraint", + srcs = ["indicator_constraint.cc"], + hdrs = ["indicator_constraint.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:intops", + "//ortools/math_opt/constraints/util:model_util", +@@ -45,6 +46,7 @@ cc_library( + name = 
"storage", + srcs = ["storage.cc"], + hdrs = ["storage.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:intops", + "//ortools/math_opt:model_cc_proto", +@@ -75,6 +77,7 @@ cc_library( + name = "validator", + srcs = ["validator.cc"], + hdrs = ["validator.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "//ortools/math_opt:model_cc_proto", +diff --git a/ortools/math_opt/constraints/quadratic/BUILD.bazel b/ortools/math_opt/constraints/quadratic/BUILD.bazel +index e4a0925..d521c19 100644 +--- a/ortools/math_opt/constraints/quadratic/BUILD.bazel ++++ b/ortools/math_opt/constraints/quadratic/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "quadratic_constraint", + srcs = ["quadratic_constraint.cc"], + hdrs = ["quadratic_constraint.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:intops", + "//ortools/math_opt/constraints/util:model_util", +@@ -50,6 +51,7 @@ cc_library( + name = "storage", + srcs = ["storage.cc"], + hdrs = ["storage.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt:model_cc_proto", + "//ortools/math_opt:model_update_cc_proto", +@@ -81,6 +83,7 @@ cc_library( + name = "validator", + srcs = ["validator.cc"], + hdrs = ["validator.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "//ortools/math_opt:model_cc_proto", +diff --git a/ortools/math_opt/constraints/second_order_cone/BUILD.bazel b/ortools/math_opt/constraints/second_order_cone/BUILD.bazel +index 37ed646..17b383d 100644 +--- a/ortools/math_opt/constraints/second_order_cone/BUILD.bazel ++++ b/ortools/math_opt/constraints/second_order_cone/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "second_order_cone_constraint", + srcs = ["second_order_cone_constraint.cc"], + hdrs = ["second_order_cone_constraint.h"], ++ features = ["-layering_check"], + deps = [ + ":storage", + "//ortools/base:intops", +@@ -47,6 +48,7 @@ cc_library( + name = "storage", + srcs = ["storage.cc"], + hdrs = ["storage.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:intops", + "//ortools/math_opt:model_cc_proto", +@@ -79,6 +81,7 @@ cc_library( + name = "validator", + srcs = ["validator.cc"], + hdrs = ["validator.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "//ortools/math_opt:model_cc_proto", +diff --git a/ortools/math_opt/constraints/sos/BUILD.bazel b/ortools/math_opt/constraints/sos/BUILD.bazel +index fade5cb..aad7cd5 100644 +--- a/ortools/math_opt/constraints/sos/BUILD.bazel ++++ b/ortools/math_opt/constraints/sos/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "sos1_constraint", + srcs = ["sos1_constraint.cc"], + hdrs = ["sos1_constraint.h"], ++ features = ["-layering_check"], + deps = [ + ":util", + "//ortools/base:intops", +@@ -50,6 +51,7 @@ cc_library( + name = "sos2_constraint", + srcs = ["sos2_constraint.cc"], + hdrs = ["sos2_constraint.h"], ++ features = ["-layering_check"], + deps = [ + ":util", + "//ortools/base:intops", +@@ -82,6 +84,7 @@ cc_test( + cc_library( + name = "storage", + hdrs = ["storage.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:intops", + "//ortools/math_opt:model_cc_proto", +@@ -112,6 +115,7 @@ cc_test( + cc_library( + name = "util", + hdrs = ["util.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt/cpp:variable_and_expressions", + "//ortools/util:fp_roundtrip_conv", +@@ -123,6 +127,7 @@ cc_library( + name = "validator", + srcs = ["validator.cc"], + hdrs = ["validator.h"], ++ 
features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "//ortools/math_opt:model_cc_proto", +diff --git a/ortools/math_opt/constraints/util/BUILD.bazel b/ortools/math_opt/constraints/util/BUILD.bazel +index c3d0c06..968ba25 100644 +--- a/ortools/math_opt/constraints/util/BUILD.bazel ++++ b/ortools/math_opt/constraints/util/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "model_util", + srcs = ["model_util.cc"], + hdrs = ["model_util.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:intops", + "//ortools/math_opt/cpp:variable_and_expressions", +diff --git a/ortools/math_opt/core/BUILD.bazel b/ortools/math_opt/core/BUILD.bazel +index 06da18f..45f3170 100644 +--- a/ortools/math_opt/core/BUILD.bazel ++++ b/ortools/math_opt/core/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "math_opt_proto_utils", + srcs = ["math_opt_proto_utils.cc"], + hdrs = ["math_opt_proto_utils.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":sparse_vector_view", +@@ -42,6 +43,7 @@ cc_library( + cc_library( + name = "sparse_vector_view", + hdrs = ["sparse_vector_view.h"], ++ features = ["-layering_check"], + deps = [ + ":arrow_operator_proxy", + ":sparse_vector", +@@ -59,6 +61,7 @@ cc_library( + name = "model_summary", + srcs = ["model_summary.cc"], + hdrs = ["model_summary.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:linked_hash_map", + "//ortools/base:status_macros", +@@ -78,6 +81,7 @@ cc_library( + name = "solver_interface", + srcs = ["solver_interface.cc"], + hdrs = ["solver_interface.h"], ++ features = ["-layering_check"], + deps = [ + ":non_streamable_solver_init_arguments", + "//ortools/base:map_util", +@@ -104,6 +108,7 @@ cc_library( + name = "solver", + srcs = ["solver.cc"], + hdrs = ["solver.h"], ++ features = ["-layering_check"], + deps = [ + ":base_solver", + ":concurrent_calls_guard", +@@ -139,6 +144,7 @@ cc_library( + name = "non_streamable_solver_init_arguments", + srcs = ["non_streamable_solver_init_arguments.cc"], + hdrs = ["non_streamable_solver_init_arguments.h"], ++ features = ["-layering_check"], + deps = ["//ortools/math_opt:parameters_cc_proto"], + ) + +@@ -146,22 +152,26 @@ cc_library( + name = "solver_debug", + srcs = ["solver_debug.cc"], + hdrs = ["solver_debug.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "arrow_operator_proxy", + hdrs = ["arrow_operator_proxy.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "sparse_vector", + hdrs = ["sparse_vector.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "sparse_submatrix", + srcs = ["sparse_submatrix.cc"], + hdrs = ["sparse_submatrix.h"], ++ features = ["-layering_check"], + deps = [ + ":sparse_vector", + ":sparse_vector_view", +@@ -176,6 +186,7 @@ cc_library( + name = "inverted_bounds", + srcs = ["inverted_bounds.cc"], + hdrs = ["inverted_bounds.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "@com_google_absl//absl/status", +@@ -188,6 +199,7 @@ cc_library( + name = "invalid_indicators", + srcs = ["invalid_indicators.cc"], + hdrs = ["invalid_indicators.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "@com_google_absl//absl/algorithm:container", +@@ -200,6 +212,7 @@ cc_library( + name = "concurrent_calls_guard", + srcs = ["concurrent_calls_guard.cc"], + hdrs = ["concurrent_calls_guard.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/base:core_headers", + 
"@com_google_absl//absl/log:check", +@@ -213,6 +226,7 @@ cc_library( + name = "empty_bounds", + srcs = ["empty_bounds.cc"], + hdrs = ["empty_bounds.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt:result_cc_proto", + "//ortools/util:fp_roundtrip_conv", +@@ -223,6 +237,7 @@ cc_library( + cc_library( + name = "sorted", + hdrs = ["sorted.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", +@@ -235,6 +250,7 @@ cc_library( + name = "base_solver", + srcs = ["base_solver.cc"], + hdrs = ["base_solver.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt:callback_cc_proto", + "//ortools/math_opt:infeasible_subsystem_cc_proto", +diff --git a/ortools/math_opt/cpp/BUILD.bazel b/ortools/math_opt/cpp/BUILD.bazel +index a606388..51e4b90 100644 +--- a/ortools/math_opt/cpp/BUILD.bazel ++++ b/ortools/math_opt/cpp/BUILD.bazel +@@ -20,6 +20,7 @@ package(default_visibility = [ + cc_library( + name = "math_opt", + hdrs = ["math_opt.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":model", +@@ -32,6 +33,7 @@ cc_library( + name = "basis_status", + srcs = ["basis_status.cc"], + hdrs = ["basis_status.h"], ++ features = ["-layering_check"], + deps = [ + ":enums", + "//ortools/math_opt:solution_cc_proto", +@@ -44,6 +46,7 @@ cc_library( + name = "sparse_containers", + srcs = ["sparse_containers.cc"], + hdrs = ["sparse_containers.h"], ++ features = ["-layering_check"], + deps = [ + ":basis_status", + ":linear_constraint", +@@ -71,6 +74,7 @@ cc_library( + name = "model", + srcs = ["model.cc"], + hdrs = ["model.h"], ++ features = ["-layering_check"], + deps = [ + ":key_types", + ":linear_constraint", +@@ -105,6 +109,7 @@ cc_library( + name = "variable_and_expressions", + srcs = ["variable_and_expressions.cc"], + hdrs = ["variable_and_expressions.h"], ++ features = ["-layering_check"], + deps = [ + ":formatters", + ":key_types", +@@ -126,6 +131,7 @@ cc_library( + name = "objective", + srcs = ["objective.cc"], + hdrs = ["objective.h"], ++ features = ["-layering_check"], + deps = [ + ":key_types", + ":variable_and_expressions", +@@ -140,6 +146,7 @@ cc_library( + cc_library( + name = "linear_constraint", + hdrs = ["linear_constraint.h"], ++ features = ["-layering_check"], + deps = [ + ":key_types", + ":variable_and_expressions", +@@ -156,6 +163,7 @@ cc_library( + name = "solution", + srcs = ["solution.cc"], + hdrs = ["solution.h"], ++ features = ["-layering_check"], + deps = [ + ":basis_status", + ":enums", +@@ -184,6 +192,7 @@ cc_library( + name = "solve_result", + srcs = ["solve_result.cc"], + hdrs = ["solve_result.h"], ++ features = ["-layering_check"], + deps = [ + ":enums", + ":linear_constraint", +@@ -212,6 +221,7 @@ cc_library( + name = "map_filter", + srcs = ["map_filter.cc"], + hdrs = ["map_filter.h"], ++ features = ["-layering_check"], + deps = [ + ":key_types", + ":linear_constraint", +@@ -231,6 +241,7 @@ cc_library( + name = "callback", + srcs = ["callback.cc"], + hdrs = ["callback.h"], ++ features = ["-layering_check"], + deps = [ + ":enums", + ":map_filter", +@@ -256,6 +267,7 @@ cc_library( + cc_library( + name = "key_types", + hdrs = ["key_types.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt/storage:model_storage", + "@com_google_absl//absl/algorithm:container", +@@ -270,6 +282,7 @@ cc_library( + name = "model_solve_parameters", + srcs = ["model_solve_parameters.cc"], + hdrs = 
["model_solve_parameters.h"], ++ features = ["-layering_check"], + deps = [ + ":linear_constraint", + ":map_filter", +@@ -295,6 +308,7 @@ cc_library( + name = "update_tracker", + srcs = ["update_tracker.cc"], + hdrs = ["update_tracker.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:logging", + "//ortools/math_opt:model_cc_proto", +@@ -310,6 +324,7 @@ cc_library( + name = "message_callback", + srcs = ["message_callback.cc"], + hdrs = ["message_callback.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:logging", + "//ortools/base:source_location", +@@ -323,6 +338,7 @@ cc_library( + cc_library( + name = "solver_init_arguments", + hdrs = ["solver_init_arguments.h"], ++ features = ["-layering_check"], + deps = [ + ":streamable_solver_init_arguments", + "//ortools/math_opt/core:non_streamable_solver_init_arguments", +@@ -333,6 +349,7 @@ cc_library( + name = "solve_arguments", + srcs = ["solve_arguments.cc"], + hdrs = ["solve_arguments.h"], ++ features = ["-layering_check"], + deps = [ + ":callback", + ":message_callback", +@@ -350,6 +367,7 @@ cc_library( + name = "solve", + srcs = ["solve.cc"], + hdrs = ["solve.h"], ++ features = ["-layering_check"], + deps = [ + ":compute_infeasible_subsystem_arguments", + ":compute_infeasible_subsystem_result", +@@ -375,6 +393,7 @@ cc_library( + name = "streamable_solver_init_arguments", + srcs = ["streamable_solver_init_arguments.cc"], + hdrs = ["streamable_solver_init_arguments.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt:parameters_cc_proto", + "//ortools/math_opt/solvers:gurobi_cc_proto", +@@ -386,6 +405,7 @@ cc_library( + name = "parameters", + srcs = ["parameters.cc"], + hdrs = ["parameters.h"], ++ features = ["-layering_check"], + deps = [ + ":enums", + "//ortools/base:linked_hash_map", +@@ -414,6 +434,7 @@ cc_library( + testonly = 1, + srcs = ["matchers.cc"], + hdrs = ["matchers.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":linear_constraint", +@@ -435,6 +456,7 @@ cc_library( + cc_library( + name = "enums", + hdrs = ["enums.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", +@@ -446,6 +468,7 @@ cc_library( + name = "statistics", + srcs = ["statistics.cc"], + hdrs = ["statistics.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":model", +@@ -456,12 +479,14 @@ cc_library( + cc_library( + name = "formatters", + hdrs = ["formatters.h"], ++ features = ["-layering_check"], + deps = ["//ortools/util:fp_roundtrip_conv"], + ) + + cc_library( + name = "update_result", + hdrs = ["update_result.h"], ++ features = ["-layering_check"], + deps = ["//ortools/math_opt:model_update_cc_proto"], + ) + +@@ -469,6 +494,7 @@ cc_library( + name = "compute_infeasible_subsystem_result", + srcs = ["compute_infeasible_subsystem_result.cc"], + hdrs = ["compute_infeasible_subsystem_result.h"], ++ features = ["-layering_check"], + deps = [ + ":enums", + ":key_types", +@@ -500,6 +526,7 @@ cc_library( + cc_library( + name = "compute_infeasible_subsystem_arguments", + hdrs = ["compute_infeasible_subsystem_arguments.h"], ++ features = ["-layering_check"], + deps = [ + ":message_callback", + ":parameters", +@@ -511,6 +538,7 @@ cc_library( + name = "solver_resources", + srcs = ["solver_resources.cc"], + hdrs = ["solver_resources.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt:rpc_cc_proto", + "//ortools/port:proto_utils", +@@ -524,6 
+552,7 @@ cc_library( + name = "solve_impl", + srcs = ["solve_impl.cc"], + hdrs = ["solve_impl.h"], ++ features = ["-layering_check"], + deps = [ + ":compute_infeasible_subsystem_arguments", + ":compute_infeasible_subsystem_result", +@@ -551,6 +580,7 @@ cc_library( + cc_library( + name = "incremental_solver", + hdrs = ["incremental_solver.h"], ++ features = ["-layering_check"], + deps = [ + ":compute_infeasible_subsystem_arguments", + ":compute_infeasible_subsystem_result", +diff --git a/ortools/math_opt/io/BUILD.bazel b/ortools/math_opt/io/BUILD.bazel +index 428beaf..8e1bb4a 100644 +--- a/ortools/math_opt/io/BUILD.bazel ++++ b/ortools/math_opt/io/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "proto_converter", + srcs = ["proto_converter.cc"], + hdrs = ["proto_converter.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "//ortools/linear_solver:linear_solver_cc_proto", +@@ -41,6 +42,7 @@ cc_library( + name = "mps_converter", + srcs = ["mps_converter.cc"], + hdrs = ["mps_converter.h"], ++ features = ["-layering_check"], + deps = [ + ":proto_converter", + "//ortools/base:status_macros", +@@ -57,6 +59,7 @@ cc_library( + name = "names_removal", + srcs = ["names_removal.cc"], + hdrs = ["names_removal.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt:model_cc_proto", + "//ortools/math_opt:model_update_cc_proto", +@@ -67,6 +70,7 @@ cc_library( + name = "lp_converter", + srcs = ["lp_converter.cc"], + hdrs = ["lp_converter.h"], ++ features = ["-layering_check"], + deps = [ + ":proto_converter", + "//ortools/base:status_macros", +@@ -81,6 +85,7 @@ cc_library( + name = "lp_parser", + srcs = ["lp_parser.cc"], + hdrs = ["lp_parser.h"], ++ features = ["-layering_check"], + deps = [ + ":mps_converter", + "//ortools/base", +diff --git a/ortools/math_opt/io/lp/BUILD.bazel b/ortools/math_opt/io/lp/BUILD.bazel +index 95d079c..02f1634 100644 +--- a/ortools/math_opt/io/lp/BUILD.bazel ++++ b/ortools/math_opt/io/lp/BUILD.bazel +@@ -15,6 +15,7 @@ cc_library( + name = "lp_model", + srcs = ["lp_model.cc"], + hdrs = ["lp_model.h"], ++ features = ["-layering_check"], + deps = [ + ":lp_name", + "//ortools/base:intops", +@@ -32,6 +33,7 @@ cc_library( + name = "lp_name", + srcs = ["lp_name.cc"], + hdrs = ["lp_name.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "@com_google_absl//absl/status", +@@ -44,6 +46,7 @@ cc_library( + name = "model_utils", + srcs = ["model_utils.cc"], + hdrs = ["model_utils.h"], ++ features = ["-layering_check"], + deps = [ + ":lp_model", + "//ortools/base:status_macros", +diff --git a/ortools/math_opt/labs/BUILD.bazel b/ortools/math_opt/labs/BUILD.bazel +index d048d84..c57e06a 100644 +--- a/ortools/math_opt/labs/BUILD.bazel ++++ b/ortools/math_opt/labs/BUILD.bazel +@@ -15,6 +15,7 @@ cc_library( + name = "general_constraint_to_mip", + srcs = ["general_constraint_to_mip.cc"], + hdrs = ["general_constraint_to_mip.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":linear_expr_util", +@@ -28,6 +29,7 @@ cc_library( + name = "linear_expr_util", + srcs = ["linear_expr_util.cc"], + hdrs = ["linear_expr_util.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + "//ortools/math_opt/cpp:math_opt", +@@ -39,6 +41,7 @@ cc_library( + name = "solution_feasibility_checker", + srcs = ["solution_feasibility_checker.cc"], + hdrs = ["solution_feasibility_checker.h"], ++ features = ["-layering_check"], + visibility = 
["//visibility:public"], + deps = [ + "//ortools/base:mathutil", +@@ -57,6 +60,7 @@ cc_library( + name = "solution_improvement", + srcs = ["solution_improvement.cc"], + hdrs = ["solution_improvement.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + "//ortools/base:status_macros", +@@ -77,6 +81,7 @@ cc_library( + "dualizer.cc", + ], + hdrs = ["dualizer.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + "//ortools/base:map_util", +diff --git a/ortools/math_opt/solver_tests/BUILD.bazel b/ortools/math_opt/solver_tests/BUILD.bazel +index 48fc1d2..e43f488 100644 +--- a/ortools/math_opt/solver_tests/BUILD.bazel ++++ b/ortools/math_opt/solver_tests/BUILD.bazel +@@ -18,6 +18,7 @@ cc_library( + testonly = 1, + srcs = ["base_solver_test.cc"], + hdrs = ["base_solver_test.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:gmock", + "//ortools/base:linked_hash_map", +@@ -34,6 +35,7 @@ cc_library( + data = [ + "//ortools/math_opt/solver_tests/testdata:23588.mps", + ], ++ features = ["-layering_check"], + deps = [ + ":base_solver_test", + ":test_models", +@@ -69,6 +71,7 @@ cc_library( + data = [ + "//ortools/math_opt/solver_tests/testdata:23588.mps", + ], ++ features = ["-layering_check"], + deps = [ + ":test_models", + "//ortools/base", +@@ -91,6 +94,7 @@ cc_library( + testonly = 1, + srcs = ["lp_tests.cc"], + hdrs = ["lp_tests.h"], ++ features = ["-layering_check"], + deps = [ + ":base_solver_test", + "//ortools/base:gmock", +@@ -111,6 +115,7 @@ cc_library( + testonly = 1, + srcs = ["lp_incomplete_solve_tests.cc"], + hdrs = ["lp_incomplete_solve_tests.h"], ++ features = ["-layering_check"], + deps = [ + ":test_models", + "//ortools/base", +@@ -131,6 +136,7 @@ cc_library( + testonly = 1, + srcs = ["invalid_input_tests.cc"], + hdrs = ["invalid_input_tests.h"], ++ features = ["-layering_check"], + deps = [ + ":base_solver_test", + "//ortools/base:gmock", +@@ -167,6 +173,7 @@ cc_library( + testonly = 1, + srcs = ["mip_tests.cc"], + hdrs = ["mip_tests.h"], ++ features = ["-layering_check"], + deps = [ + ":base_solver_test", + "//ortools/base", +@@ -185,6 +192,7 @@ cc_library( + testonly = 1, + srcs = ["ip_model_solve_parameters_tests.cc"], + hdrs = ["ip_model_solve_parameters_tests.h"], ++ features = ["-layering_check"], + deps = [ + ":base_solver_test", + "//ortools/base:gmock", +@@ -204,6 +212,7 @@ cc_library( + testonly = 1, + srcs = ["ip_multiple_solutions_tests.cc"], + hdrs = ["ip_multiple_solutions_tests.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:gmock", + "//ortools/math_opt/cpp:matchers", +@@ -220,6 +229,7 @@ cc_library( + testonly = 1, + srcs = ["lp_model_solve_parameters_tests.cc"], + hdrs = ["lp_model_solve_parameters_tests.h"], ++ features = ["-layering_check"], + deps = [ + ":base_solver_test", + ":test_models", +@@ -237,6 +247,7 @@ cc_library( + testonly = 1, + srcs = ["lp_parameter_tests.cc"], + hdrs = ["lp_parameter_tests.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:gmock", + "//ortools/base:status_macros", +@@ -258,6 +269,7 @@ cc_library( + testonly = 1, + srcs = ["lp_initial_basis_tests.cc"], + hdrs = ["lp_initial_basis_tests.h"], ++ features = ["-layering_check"], + deps = [ + ":base_solver_test", + "//ortools/base:gmock", +@@ -279,6 +291,7 @@ cc_library( + "//ortools/math_opt/solver_tests/testdata:23588.mps", + "//ortools/math_opt/solver_tests/testdata:beavma.mps", + ], ++ features = ["-layering_check"], + deps = [ + ":test_models", + 
"//ortools/base", +@@ -305,6 +318,7 @@ cc_library( + testonly = 1, + srcs = ["multi_objective_tests.cc"], + hdrs = ["multi_objective_tests.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:gmock", + "//ortools/base:status_macros", +@@ -327,6 +341,7 @@ cc_library( + testonly = 1, + srcs = ["qp_tests.cc"], + hdrs = ["qp_tests.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:gmock", +@@ -345,6 +360,7 @@ cc_library( + testonly = 1, + srcs = ["qc_tests.cc"], + hdrs = ["qc_tests.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:gmock", + "//ortools/math_opt/cpp:matchers", +@@ -363,6 +379,7 @@ cc_library( + testonly = 1, + srcs = ["second_order_cone_tests.cc"], + hdrs = ["second_order_cone_tests.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:gmock", + "//ortools/math_opt/cpp:matchers", +@@ -381,6 +398,7 @@ cc_library( + testonly = 1, + srcs = ["logical_constraint_tests.cc"], + hdrs = ["logical_constraint_tests.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:gmock", + "//ortools/math_opt:model_update_cc_proto", +@@ -401,6 +419,7 @@ cc_library( + testonly = 1, + srcs = ["test_models.cc"], + hdrs = ["test_models.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt/cpp:math_opt", + "@com_google_absl//absl/log:check", +@@ -426,6 +445,7 @@ cc_library( + testonly = True, + srcs = ["generic_tests.cc"], + hdrs = ["generic_tests.h"], ++ features = ["-layering_check"], + deps = [ + ":test_models", + "//ortools/base:gmock", +@@ -452,6 +472,7 @@ cc_library( + testonly = 1, + srcs = ["infeasible_subsystem_tests.cc"], + hdrs = ["infeasible_subsystem_tests.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:gmock", + "//ortools/gurobi:gurobi_stdout_matchers", +diff --git a/ortools/math_opt/solvers/BUILD.bazel b/ortools/math_opt/solvers/BUILD.bazel +index e7e8054..ef6c123 100644 +--- a/ortools/math_opt/solvers/BUILD.bazel ++++ b/ortools/math_opt/solvers/BUILD.bazel +@@ -22,6 +22,7 @@ cc_library( + "gscip_solver.cc", + "gscip_solver.h", + ], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":message_callback_data", +@@ -106,6 +107,7 @@ cc_library( + name = "gurobi_callback", + srcs = ["gurobi_callback.cc"], + hdrs = ["gurobi_callback.h"], ++ features = ["-layering_check"], + deps = [ + ":message_callback_data", + "//ortools/base:linked_hash_map", +@@ -140,6 +142,7 @@ cc_library( + hdrs = [ + "gurobi_init_arguments.h", + ], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":gurobi_callback", +@@ -195,6 +198,7 @@ cc_library( + "glop_solver.cc", + "glop_solver.h", + ], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + "//ortools/base:map_util", +@@ -245,6 +249,7 @@ cc_library( + "cp_sat_solver.cc", + "cp_sat_solver.h", + ], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + "//ortools/base:protoutil", +@@ -311,9 +316,9 @@ cc_test( + + cc_test( + name = "cp_sat_solver_test", ++ timeout = "eternal", + srcs = ["cp_sat_solver_test.cc"], + shard_count = 10, +- timeout = "eternal", + deps = [ + ":cp_sat_solver", + "//ortools/base:gmock_main", +@@ -342,6 +347,7 @@ cc_library( + name = "message_callback_data", + srcs = ["message_callback_data.cc"], + hdrs = ["message_callback_data.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/math_opt/core:solver_interface", + "@com_google_absl//absl/strings", +@@ 
-365,6 +371,7 @@ cc_library( + name = "pdlp_bridge", + srcs = ["pdlp_bridge.cc"], + hdrs = ["pdlp_bridge.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "//ortools/math_opt:model_cc_proto", +@@ -391,6 +398,7 @@ cc_library( + "pdlp_solver.cc", + "pdlp_solver.h", + ], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":pdlp_bridge", +@@ -464,6 +472,7 @@ cc_library( + "glpk_solver.cc", + "glpk_solver.h", + ], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":glpk_cc_proto", +@@ -563,6 +572,7 @@ cc_library( + "highs_solver.cc", + "highs_solver.h", + ], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":highs_cc_proto", +diff --git a/ortools/math_opt/solvers/glpk/BUILD.bazel b/ortools/math_opt/solvers/glpk/BUILD.bazel +index b33dd3b..af07950 100644 +--- a/ortools/math_opt/solvers/glpk/BUILD.bazel ++++ b/ortools/math_opt/solvers/glpk/BUILD.bazel +@@ -18,6 +18,7 @@ cc_library( + name = "rays", + srcs = ["rays.cc"], + hdrs = ["rays.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:logging", + "//ortools/base:status_macros", +@@ -35,6 +36,7 @@ cc_library( + name = "glpk_sparse_vector", + srcs = ["glpk_sparse_vector.cc"], + hdrs = ["glpk_sparse_vector.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:logging", + "@com_google_absl//absl/log:check", +@@ -54,6 +56,7 @@ cc_library( + name = "gap", + srcs = ["gap.cc"], + hdrs = ["gap.h"], ++ features = ["-layering_check"], + ) + + cc_test( +diff --git a/ortools/math_opt/solvers/gscip/BUILD.bazel b/ortools/math_opt/solvers/gscip/BUILD.bazel +index fd91d85..fcc2c9c 100644 +--- a/ortools/math_opt/solvers/gscip/BUILD.bazel ++++ b/ortools/math_opt/solvers/gscip/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "gscip_solver_constraint_handler", + srcs = ["gscip_solver_constraint_handler.cc"], + hdrs = ["gscip_solver_constraint_handler.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:linked_hash_map", + "//ortools/base:protoutil", +diff --git a/ortools/math_opt/solvers/gurobi/BUILD.bazel b/ortools/math_opt/solvers/gurobi/BUILD.bazel +index 32f8f39..5039d84 100644 +--- a/ortools/math_opt/solvers/gurobi/BUILD.bazel ++++ b/ortools/math_opt/solvers/gurobi/BUILD.bazel +@@ -19,6 +19,7 @@ cc_library( + hdrs = [ + "g_gurobi.h", + ], ++ features = ["-layering_check"], + visibility = [ + "//ortools/gurobi:__subpackages__", + "//ortools/math_opt:__subpackages__", +diff --git a/ortools/math_opt/storage/BUILD.bazel b/ortools/math_opt/storage/BUILD.bazel +index cb85a81..459d2b0 100644 +--- a/ortools/math_opt/storage/BUILD.bazel ++++ b/ortools/math_opt/storage/BUILD.bazel +@@ -16,6 +16,7 @@ package(default_visibility = ["//ortools/math_opt:__subpackages__"]) + cc_library( + name = "model_storage_types", + hdrs = ["model_storage_types.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:intops", + "@com_google_absl//absl/strings", +@@ -25,11 +26,13 @@ cc_library( + cc_library( + name = "range", + hdrs = ["range.h"], ++ features = ["-layering_check"], + ) + + cc_library( + name = "iterators", + hdrs = ["iterators.h"], ++ features = ["-layering_check"], + deps = [":range"], + ) + +@@ -37,6 +40,7 @@ cc_library( + name = "sparse_coefficient_map", + srcs = ["sparse_coefficient_map.cc"], + hdrs = ["sparse_coefficient_map.h"], ++ features = ["-layering_check"], + deps = [ + ":model_storage_types", + "//ortools/base:intops", +@@ -51,6 +55,7 @@ cc_library( 
+ name = "sparse_matrix", + srcs = ["sparse_matrix.cc"], + hdrs = ["sparse_matrix.h"], ++ features = ["-layering_check"], + deps = [ + ":model_storage_types", + "//ortools/base:intops", +@@ -67,6 +72,7 @@ cc_library( + cc_library( + name = "linear_expression_data", + hdrs = ["linear_expression_data.h"], ++ features = ["-layering_check"], + deps = [ + ":sparse_coefficient_map", + "//ortools/math_opt:sparse_containers_cc_proto", +@@ -78,6 +84,7 @@ cc_library( + cc_library( + name = "update_trackers", + hdrs = ["update_trackers.h"], ++ features = ["-layering_check"], + deps = [ + ":model_storage_types", + "//ortools/base:intops", +@@ -93,6 +100,7 @@ cc_library( + name = "variable_storage", + srcs = ["variable_storage.cc"], + hdrs = ["variable_storage.h"], ++ features = ["-layering_check"], + deps = [ + ":model_storage_types", + ":range", +@@ -112,6 +120,7 @@ cc_library( + name = "objective_storage", + srcs = ["objective_storage.cc"], + hdrs = ["objective_storage.h"], ++ features = ["-layering_check"], + deps = [ + ":range", + ":sparse_coefficient_map", +@@ -136,6 +145,7 @@ cc_library( + name = "linear_constraint_storage", + srcs = ["linear_constraint_storage.cc"], + hdrs = ["linear_constraint_storage.h"], ++ features = ["-layering_check"], + deps = [ + ":model_storage_types", + ":range", +@@ -158,6 +168,7 @@ cc_library( + cc_library( + name = "atomic_constraint_storage", + hdrs = ["atomic_constraint_storage.h"], ++ features = ["-layering_check"], + deps = [ + ":model_storage_types", + ":range", +@@ -176,6 +187,7 @@ cc_library( + name = "model_storage", + srcs = ["model_storage.cc"], + hdrs = ["model_storage.h"], ++ features = ["-layering_check"], + deps = [ + ":atomic_constraint_storage", + ":iterators", +diff --git a/ortools/math_opt/testing/BUILD.bazel b/ortools/math_opt/testing/BUILD.bazel +index e80e4e0..058bd4b 100644 +--- a/ortools/math_opt/testing/BUILD.bazel ++++ b/ortools/math_opt/testing/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "param_name", + testonly = True, + hdrs = ["param_name.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:gmock", + ], +@@ -25,4 +26,5 @@ cc_library( + cc_library( + name = "stream", + hdrs = ["stream.h"], ++ features = ["-layering_check"], + ) +diff --git a/ortools/math_opt/tools/BUILD.bazel b/ortools/math_opt/tools/BUILD.bazel +index 55c1f2f..adfeac6 100644 +--- a/ortools/math_opt/tools/BUILD.bazel ++++ b/ortools/math_opt/tools/BUILD.bazel +@@ -66,6 +66,7 @@ cc_library( + name = "file_format_flags", + srcs = ["file_format_flags.cc"], + hdrs = ["file_format_flags.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:file", +diff --git a/ortools/math_opt/validators/BUILD.bazel b/ortools/math_opt/validators/BUILD.bazel +index 5448a93..c23ef8e 100644 +--- a/ortools/math_opt/validators/BUILD.bazel ++++ b/ortools/math_opt/validators/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "ids_validator", + srcs = ["ids_validator.cc"], + hdrs = ["ids_validator.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:status_macros", + "//ortools/math_opt/core:model_summary", +@@ -31,6 +32,7 @@ cc_library( + name = "scalar_validator", + srcs = ["scalar_validator.cc"], + hdrs = ["scalar_validator.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", +@@ -41,6 +43,7 @@ cc_library( + name = "sparse_matrix_validator", + srcs = ["sparse_matrix_validator.cc"], + hdrs = ["sparse_matrix_validator.h"], ++ features = ["-layering_check"], 
+ deps = [ + ":ids_validator", + "//ortools/base:status_macros", +@@ -56,6 +59,7 @@ cc_library( + cc_library( + name = "sparse_vector_validator", + hdrs = ["sparse_vector_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":ids_validator", + ":scalar_validator", +@@ -70,6 +74,7 @@ cc_library( + name = "model_validator", + srcs = ["model_validator.cc"], + hdrs = ["model_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":ids_validator", + ":scalar_validator", +@@ -94,6 +99,7 @@ cc_library( + name = "solve_stats_validator", + srcs = ["solve_stats_validator.cc"], + hdrs = ["solve_stats_validator.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:protoutil", + "//ortools/math_opt:result_cc_proto", +@@ -108,6 +114,7 @@ cc_library( + name = "result_validator", + srcs = ["result_validator.cc"], + hdrs = ["result_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":solution_validator", + ":solve_stats_validator", +@@ -128,6 +135,7 @@ cc_library( + name = "solution_validator", + srcs = ["solution_validator.cc"], + hdrs = ["solution_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":ids_validator", + ":scalar_validator", +@@ -148,6 +156,7 @@ cc_library( + name = "solve_parameters_validator", + srcs = ["solve_parameters_validator.cc"], + hdrs = ["solve_parameters_validator.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:protoutil", + "//ortools/base:status_macros", +@@ -164,6 +173,7 @@ cc_library( + name = "callback_validator", + srcs = ["callback_validator.cc"], + hdrs = ["callback_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":ids_validator", + ":model_parameters_validator", +@@ -190,6 +200,7 @@ cc_library( + name = "model_parameters_validator", + srcs = ["model_parameters_validator.cc"], + hdrs = ["model_parameters_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":ids_validator", + ":solution_validator", +@@ -208,6 +219,7 @@ cc_library( + name = "linear_expression_validator", + srcs = ["linear_expression_validator.cc"], + hdrs = ["linear_expression_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":scalar_validator", + ":sparse_vector_validator", +@@ -223,6 +235,7 @@ cc_library( + name = "infeasible_subsystem_validator", + srcs = ["infeasible_subsystem_validator.cc"], + hdrs = ["infeasible_subsystem_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":bounds_and_status_validator", + ":ids_validator", +@@ -238,6 +251,7 @@ cc_library( + name = "bounds_and_status_validator", + srcs = ["bounds_and_status_validator.cc"], + hdrs = ["bounds_and_status_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":scalar_validator", + "//ortools/base:status_macros", +@@ -252,6 +266,7 @@ cc_library( + name = "termination_validator", + srcs = ["termination_validator.cc"], + hdrs = ["termination_validator.h"], ++ features = ["-layering_check"], + deps = [ + ":bounds_and_status_validator", + "//ortools/base:status_macros", +diff --git a/ortools/packing/BUILD.bazel b/ortools/packing/BUILD.bazel +index 04b7014..a774e8f 100644 +--- a/ortools/packing/BUILD.bazel ++++ b/ortools/packing/BUILD.bazel +@@ -21,6 +21,7 @@ cc_library( + name = "arc_flow_builder", + srcs = ["arc_flow_builder.cc"], + hdrs = ["arc_flow_builder.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:map_util", +@@ -37,6 +38,7 @@ cc_library( + "arc_flow_solver.cc", + ], + hdrs = ["arc_flow_solver.h"], ++ features = ["-layering_check"], + deps = [ + 
":arc_flow_builder", + "//ortools/base", +@@ -65,6 +67,7 @@ cc_library( + name = "vector_bin_packing_parser", + srcs = ["vector_bin_packing_parser.cc"], + hdrs = ["vector_bin_packing_parser.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":vector_bin_packing_cc_proto", +@@ -131,6 +134,7 @@ cc_library( + name = "binpacking_2d_parser", + srcs = ["binpacking_2d_parser.cc"], + hdrs = ["binpacking_2d_parser.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":multiple_dimensions_bin_packing_cc_proto", +diff --git a/ortools/pdlp/BUILD.bazel b/ortools/pdlp/BUILD.bazel +index 5b68856..739a948 100644 +--- a/ortools/pdlp/BUILD.bazel ++++ b/ortools/pdlp/BUILD.bazel +@@ -20,6 +20,7 @@ package(default_visibility = ["//visibility:public"]) + cc_library( + name = "scheduler", + hdrs = ["scheduler.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/functional:any_invocable", + ], +@@ -62,6 +63,7 @@ py_proto_library( + cc_library( + name = "gtest_main", + srcs = ["gtest_main.cc"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:gmock", +@@ -72,6 +74,7 @@ cc_library( + name = "iteration_stats", + srcs = ["iteration_stats.cc"], + hdrs = ["iteration_stats.h"], ++ features = ["-layering_check"], + deps = [ + ":quadratic_program", + ":sharded_quadratic_program", +@@ -105,6 +108,7 @@ cc_library( + name = "primal_dual_hybrid_gradient", + srcs = ["primal_dual_hybrid_gradient.cc"], + hdrs = ["primal_dual_hybrid_gradient.h"], ++ features = ["-layering_check"], + deps = [ + ":iteration_stats", + ":quadratic_program", +@@ -169,6 +173,7 @@ cc_library( + name = "quadratic_program", + srcs = ["quadratic_program.cc"], + hdrs = ["quadratic_program.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:status_macros", +@@ -201,6 +206,7 @@ cc_library( + name = "quadratic_program_io", + srcs = ["quadratic_program_io.cc"], + hdrs = ["quadratic_program_io.h"], ++ features = ["-layering_check"], + deps = [ + ":quadratic_program", + "//ortools/base", +@@ -224,6 +230,7 @@ cc_library( + name = "sharded_optimization_utils", + srcs = ["sharded_optimization_utils.cc"], + hdrs = ["sharded_optimization_utils.h"], ++ features = ["-layering_check"], + deps = [ + ":quadratic_program", + ":sharded_quadratic_program", +@@ -256,6 +263,7 @@ cc_library( + name = "sharded_quadratic_program", + srcs = ["sharded_quadratic_program.cc"], + hdrs = ["sharded_quadratic_program.h"], ++ features = ["-layering_check"], + deps = [ + ":quadratic_program", + ":sharder", +@@ -285,6 +293,7 @@ cc_library( + name = "sharder", + srcs = ["sharder.cc"], + hdrs = ["sharder.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:mathutil", +@@ -315,6 +324,7 @@ cc_library( + name = "solvers_proto_validation", + srcs = ["solvers_proto_validation.cc"], + hdrs = ["solvers_proto_validation.h"], ++ features = ["-layering_check"], + deps = [ + ":solvers_cc_proto", + "//ortools/base:status_macros", +@@ -340,6 +350,7 @@ cc_library( + name = "termination", + srcs = ["termination.cc"], + hdrs = ["termination.h"], ++ features = ["-layering_check"], + deps = [ + ":solve_log_cc_proto", + ":solvers_cc_proto", +@@ -365,6 +376,7 @@ cc_library( + testonly = 1, + srcs = ["test_util.cc"], + hdrs = ["test_util.h"], ++ features = ["-layering_check"], + deps = [ + ":quadratic_program", + "//ortools/base", +@@ -390,6 +402,7 @@ cc_library( + name = "trust_region", + srcs = 
["trust_region.cc"], + hdrs = ["trust_region.h"], ++ features = ["-layering_check"], + deps = [ + ":quadratic_program", + ":sharded_optimization_utils", +diff --git a/ortools/port/BUILD.bazel b/ortools/port/BUILD.bazel +index 00b8585..b947b31 100644 +--- a/ortools/port/BUILD.bazel ++++ b/ortools/port/BUILD.bazel +@@ -17,6 +17,7 @@ cc_library( + name = "sysinfo", + srcs = ["sysinfo.cc"], + hdrs = ["sysinfo.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:sysinfo", +@@ -27,6 +28,7 @@ cc_library( + name = "proto_utils", + srcs = ["proto_utils.cc"], + hdrs = ["proto_utils.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/util:parse_proto", +@@ -38,6 +40,7 @@ cc_library( + cc_library( + name = "utf8", + hdrs = ["utf8.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:encodingutils", +@@ -52,6 +55,7 @@ cc_library( + hdrs = [ + "file.h", + ], ++ features = ["-layering_check"], + deps = [ + "//ortools/base:file", + "@com_google_absl//absl/status", +@@ -64,4 +68,5 @@ cc_library( + cc_library( + name = "scoped_std_stream_capture", + hdrs = ["scoped_std_stream_capture.h"], ++ features = ["-layering_check"], + ) +diff --git a/ortools/routing/parsers/BUILD.bazel b/ortools/routing/parsers/BUILD.bazel +index 94690f3..a99b6dd 100644 +--- a/ortools/routing/parsers/BUILD.bazel ++++ b/ortools/routing/parsers/BUILD.bazel +@@ -30,6 +30,7 @@ cc_library( + name = "simple_graph", + srcs = ["simple_graph.cc"], + hdrs = ["simple_graph.h"], ++ features = ["-layering_check"], + deps = [ + "@com_google_absl//absl/hash", + ], +@@ -50,6 +51,7 @@ cc_library( + name = "solomon_parser", + srcs = ["solomon_parser.cc"], + hdrs = ["solomon_parser.h"], ++ features = ["-layering_check"], + deps = [ + ":simple_graph", + "//ortools/base", +@@ -79,6 +81,7 @@ cc_library( + name = "lilim_parser", + srcs = ["lilim_parser.cc"], + hdrs = ["lilim_parser.h"], ++ features = ["-layering_check"], + deps = [ + ":simple_graph", + "//ortools/base:file", +@@ -110,6 +113,7 @@ cc_library( + name = "carp_parser", + srcs = ["carp_parser.cc"], + hdrs = ["carp_parser.h"], ++ features = ["-layering_check"], + deps = [ + ":simple_graph", + "//ortools/base", +@@ -153,6 +157,7 @@ cc_library( + name = "nearp_parser", + srcs = ["nearp_parser.cc"], + hdrs = ["nearp_parser.h"], ++ features = ["-layering_check"], + deps = [ + ":simple_graph", + "//ortools/base", +@@ -186,6 +191,7 @@ cc_library( + name = "pdtsp_parser", + srcs = ["pdtsp_parser.cc"], + hdrs = ["pdtsp_parser.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + "//ortools/base", +@@ -220,6 +226,7 @@ cc_library( + name = "tsplib_parser", + srcs = ["tsplib_parser.cc"], + hdrs = ["tsplib_parser.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":simple_graph", +@@ -268,6 +275,7 @@ cc_library( + name = "tsptw_parser", + srcs = ["tsptw_parser.cc"], + hdrs = ["tsptw_parser.h"], ++ features = ["-layering_check"], + visibility = ["//visibility:public"], + deps = [ + ":simple_graph", +@@ -302,6 +310,7 @@ cc_library( + name = "solution_serializer", + srcs = ["solution_serializer.cc"], + hdrs = ["solution_serializer.h"], ++ features = ["-layering_check"], + deps = [ + ":simple_graph", + "//ortools/base", +@@ -332,6 +341,7 @@ cc_library( + name = "cvrptw_lib", + srcs = ["cvrptw_lib.cc"], + hdrs = ["cvrptw_lib.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + 
"//ortools/constraint_solver:routing", +@@ -343,6 +353,7 @@ cc_library( + name = "dow_parser", + srcs = ["dow_parser.cc"], + hdrs = ["dow_parser.h"], ++ features = ["-layering_check"], + deps = [ + ":capacity_planning_cc_proto", + "//ortools/base", +diff --git a/ortools/sat/BUILD.bazel b/ortools/sat/BUILD.bazel +index 222559f..b05763d 100644 +--- a/ortools/sat/BUILD.bazel ++++ b/ortools/sat/BUILD.bazel +@@ -24,6 +24,7 @@ cc_library( + name = "cp_model", + srcs = ["cp_model.cc"], + hdrs = ["cp_model.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_solver", +@@ -42,6 +43,7 @@ cc_library( + cc_library( + name = "model", + hdrs = ["model.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/base", + "//ortools/base:typeid", +@@ -95,6 +97,7 @@ cc_library( + name = "cp_model_utils", + srcs = ["cp_model_utils.cc"], + hdrs = ["cp_model_utils.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":sat_base", +@@ -117,6 +120,7 @@ cc_library( + name = "synchronization", + srcs = ["synchronization.cc"], + hdrs = ["synchronization.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_utils", +@@ -162,6 +166,7 @@ cc_library( + name = "cp_model_checker", + srcs = ["cp_model_checker.cc"], + hdrs = ["cp_model_checker.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_utils", +@@ -185,6 +190,7 @@ cc_library( + name = "constraint_violation", + srcs = ["constraint_violation.cc"], + hdrs = ["constraint_violation.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_utils", +@@ -208,6 +214,7 @@ cc_library( + name = "feasibility_jump", + srcs = ["feasibility_jump.cc"], + hdrs = ["feasibility_jump.h"], ++ features = ["-layering_check"], + deps = [ + ":constraint_violation", + ":cp_model_cc_proto", +@@ -243,6 +250,7 @@ cc_library( + name = "linear_model", + srcs = ["linear_model.cc"], + hdrs = ["linear_model.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_utils", +@@ -258,6 +266,7 @@ cc_library( + name = "parameters_validation", + srcs = ["parameters_validation.cc"], + hdrs = ["parameters_validation.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_search", + ":sat_parameters_cc_proto", +@@ -269,6 +278,7 @@ cc_library( + name = "cp_model_search", + srcs = ["cp_model_search.cc"], + hdrs = ["cp_model_search.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_mapping", +@@ -297,6 +307,7 @@ cc_library( + name = "cp_model_solver_helpers", + srcs = ["cp_model_solver_helpers.cc"], + hdrs = ["cp_model_solver_helpers.h"], ++ features = ["-layering_check"], + deps = [ + ":circuit", + ":clause", +@@ -381,6 +392,7 @@ cc_library( + name = "shaving_solver", + srcs = ["shaving_solver.cc"], + hdrs = ["shaving_solver.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_lns", +@@ -411,6 +423,7 @@ cc_library( + name = "cp_model_solver", + srcs = ["cp_model_solver.cc"], + hdrs = ["cp_model_solver.h"], ++ features = ["-layering_check"], + deps = [ + ":circuit", + ":clause", +@@ -498,6 +511,7 @@ cc_library( + cc_library( + name = "cp_model_mapping", + hdrs = ["cp_model_mapping.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_utils", +@@ -521,6 +535,7 @@ cc_library( + name = "cp_model_loader", + srcs = ["cp_model_loader.cc"], + hdrs = ["cp_model_loader.h"], ++ features = ["-layering_check"], + deps = [ 
+ ":all_different", + ":circuit", +@@ -582,6 +597,7 @@ cc_library( + name = "presolve_util", + srcs = ["presolve_util.cc"], + hdrs = ["presolve_util.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_utils", +@@ -612,6 +628,7 @@ cc_library( + name = "presolve_context", + srcs = ["presolve_context.cc"], + hdrs = ["presolve_context.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_loader", +@@ -653,6 +670,7 @@ cc_library( + "cp_model_presolve.cc", + ], + hdrs = ["cp_model_presolve.h"], ++ features = ["-layering_check"], + deps = [ + ":2d_rectangle_presolve", + ":circuit", +@@ -718,6 +736,7 @@ cc_library( + "cp_model_postsolve.cc", + ], + hdrs = ["cp_model_postsolve.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_utils", +@@ -734,6 +753,7 @@ cc_library( + name = "cp_model_expand", + srcs = ["cp_model_expand.cc"], + hdrs = ["cp_model_expand.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_checker", +@@ -762,6 +782,7 @@ cc_library( + cc_library( + name = "sat_base", + hdrs = ["sat_base.h"], ++ features = ["-layering_check"], + deps = [ + ":model", + "//ortools/base", +@@ -788,6 +809,7 @@ cc_library( + "sat_solver.cc", + ], + hdrs = ["sat_solver.h"], ++ features = ["-layering_check"], + deps = [ + ":clause", + ":drat_proof_handler", +@@ -826,6 +848,7 @@ cc_library( + name = "restart", + srcs = ["restart.cc"], + hdrs = ["restart.h"], ++ features = ["-layering_check"], + deps = [ + ":model", + ":sat_decision", +@@ -844,6 +867,7 @@ cc_library( + name = "probing", + srcs = ["probing.cc"], + hdrs = ["probing.h"], ++ features = ["-layering_check"], + deps = [ + ":clause", + ":implied_bounds", +@@ -874,6 +898,7 @@ cc_library( + name = "sat_inprocessing", + srcs = ["sat_inprocessing.cc"], + hdrs = ["sat_inprocessing.h"], ++ features = ["-layering_check"], + deps = [ + ":clause", + ":drat_checker", +@@ -907,6 +932,7 @@ cc_library( + name = "sat_decision", + srcs = ["sat_decision.cc"], + hdrs = ["sat_decision.h"], ++ features = ["-layering_check"], + deps = [ + ":model", + ":pb_constraint", +@@ -927,6 +953,7 @@ cc_library( + name = "clause", + srcs = ["clause.cc"], + hdrs = ["clause.h"], ++ features = ["-layering_check"], + deps = [ + ":drat_proof_handler", + ":inclusion", +@@ -959,6 +986,7 @@ cc_library( + name = "simplification", + srcs = ["simplification.cc"], + hdrs = ["simplification.h"], ++ features = ["-layering_check"], + deps = [ + ":drat_proof_handler", + ":model", +@@ -988,6 +1016,7 @@ cc_library( + name = "pb_constraint", + srcs = ["pb_constraint.cc"], + hdrs = ["pb_constraint.h"], ++ features = ["-layering_check"], + deps = [ + ":model", + ":sat_base", +@@ -1012,6 +1041,7 @@ cc_library( + name = "symmetry", + srcs = ["symmetry.cc"], + hdrs = ["symmetry.h"], ++ features = ["-layering_check"], + deps = [ + ":sat_base", + "//ortools/algorithms:sparse_permutation", +@@ -1027,6 +1057,7 @@ cc_library( + name = "symmetry_util", + srcs = ["symmetry_util.cc"], + hdrs = ["symmetry_util.h"], ++ features = ["-layering_check"], + deps = [ + "//ortools/algorithms:dynamic_partition", + "//ortools/algorithms:sparse_permutation", +@@ -1040,6 +1071,7 @@ cc_library( + name = "var_domination", + srcs = ["var_domination.cc"], + hdrs = ["var_domination.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_cc_proto", + ":cp_model_utils", +@@ -1068,6 +1100,7 @@ cc_library( + name = "integer", + srcs = ["integer.cc"], + hdrs = ["integer.h"], ++ features = 
["-layering_check"], + deps = [ + ":model", + ":sat_base", +@@ -1097,6 +1130,7 @@ cc_library( + name = "integer_search", + srcs = ["integer_search.cc"], + hdrs = ["integer_search.h"], ++ features = ["-layering_check"], + deps = [ + ":clause", + ":cp_model_cc_proto", +@@ -1134,6 +1168,7 @@ cc_library( + name = "lb_tree_search", + srcs = ["lb_tree_search.cc"], + hdrs = ["lb_tree_search.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_mapping", + ":integer", +@@ -1165,6 +1200,7 @@ cc_library( + name = "pseudo_costs", + srcs = ["pseudo_costs.cc"], + hdrs = ["pseudo_costs.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_model_mapping", + ":integer", +@@ -1187,6 +1223,7 @@ cc_library( + name = "intervals", + srcs = ["intervals.cc"], + hdrs = ["intervals.h"], ++ features = ["-layering_check"], + deps = [ + ":cp_constraints", + ":implied_bounds", +@@ -1216,6 +1253,7 @@ cc_library( + name = "precedences", + srcs = ["precedences.cc"], + hdrs = ["precedences.h"], ++ features = ["-layering_check"], + deps = [ + ":clause", + ":cp_constraints", +@@ -1251,6 +1289,7 @@ cc_library( + name = "integer_expr", + srcs = ["integer_expr.cc"], + hdrs = ["integer_expr.h"], ++ features = ["-layering_check"], + deps = [ + ":integer", + ":linear_constraint", +@@ -1278,6 +1317,7 @@ cc_library( + name = "linear_propagation", + srcs = ["linear_propagation.cc"], + hdrs = ["linear_propagation.h"], ++ features = ["-layering_check"], + deps = [ + ":integer", + ":model", +@@ -1310,6 +1350,7 @@ cc_library( + name = "all_different", + srcs = ["all_different.cc"], + hdrs = ["all_different.h"], ++ features = ["-layering_check"], + deps = [ + ":integer", + ":model", +@@ -1331,6 +1372,7 @@ cc_library( + name = "theta_tree", + srcs = ["theta_tree.cc"], + hdrs = ["theta_tree.h"], ++ features = ["-layering_check"], + deps = [ + ":integer", + "//ortools/base", +@@ -1342,6 +1384,7 @@ cc_library( + name = "disjunctive", + srcs = ["disjunctive.cc"], + hdrs = ["disjunctive.h"], ++ features = ["-layering_check"], + deps = [ + ":all_different", + ":integer", +@@ -1368,6 +1411,7 @@ cc_library( + name = "timetable", + srcs = ["timetable.cc"], + hdrs = ["timetable.h"], ++ features = ["-layering_check"], + deps = [ + ":integer", + ":intervals", +@@ -1384,6 +1428,7 @@ cc_library( + name = "timetable_edgefinding", + srcs = ["timetable_edgefinding.cc"], + hdrs = ["timetable_edgefinding.h"], ++ features = ["-layering_check"], + deps = [ + ":integer", + ":intervals", +@@ -1399,6 +1444,7 @@ cc_library( + name = "cumulative", + srcs = ["cumulative.cc"], + hdrs = ["cumulative.h"], ++ features = ["-layering_check"], + deps = [ + ":cumulative_energy", + ":disjunctive", +@@ -1425,6 +1471,7 @@ cc_library( + name = "cumulative_energy", + srcs = ["cumulative_energy.cc"], + hdrs = ["cumulative_energy.h"], ++ features = ["-layering_check"], + deps = [ + ":2d_orthogonal_packing", + ":diffn_util", +@@ -1447,6 +1494,7 @@ cc_library( + name = "boolean_problem", + srcs = ["boolean_problem.cc"], + hdrs = ["boolean_problem.h"], ++ features = ["-layering_check"], + deps = [ + ":boolean_problem_cc_proto", + ":cp_model_cc_proto", +@@ -1480,6 +1528,7 @@ cc_library( + name = "linear_relaxation", + srcs = ["linear_relaxation.cc"], + hdrs = ["linear_relaxation.h"], ++ features = ["-layering_check"], + deps = [ + ":circuit", + ":clause", +@@ -1524,6 +1573,7 @@ cc_library( + name = "linear_constraint", + srcs = ["linear_constraint.cc"], + hdrs = ["linear_constraint.h"], ++ features = ["-layering_check"], + deps = [ + ":integer", + ":model", +@@ -1544,6 
+1594,7 @@ cc_library(
+ name = "linear_programming_constraint",
+ srcs = ["linear_programming_constraint.cc"],
+ hdrs = ["linear_programming_constraint.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cp_model_cc_proto",
+ ":cp_model_mapping",
+@@ -1590,6 +1641,7 @@ cc_library(
+ name = "linear_constraint_manager",
+ srcs = ["linear_constraint_manager.cc"],
+ hdrs = ["linear_constraint_manager.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":integer",
+ ":linear_constraint",
+@@ -1620,6 +1672,7 @@ cc_library(
+ name = "cuts",
+ srcs = ["cuts.cc"],
+ hdrs = ["cuts.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":clause",
+ ":implied_bounds",
+@@ -1653,6 +1706,7 @@ cc_library(
+ name = "routing_cuts",
+ srcs = ["routing_cuts.cc"],
+ hdrs = ["routing_cuts.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cp_model_cc_proto",
+ ":cuts",
+@@ -1680,6 +1734,7 @@ cc_library(
+ name = "scheduling_cuts",
+ srcs = ["scheduling_cuts.cc"],
+ hdrs = ["scheduling_cuts.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cuts",
+ ":implied_bounds",
+@@ -1710,6 +1765,7 @@ cc_library(
+ name = "diffn_cuts",
+ srcs = ["diffn_cuts.cc"],
+ hdrs = ["diffn_cuts.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cuts",
+ ":diffn_util",
+@@ -1741,6 +1797,7 @@ cc_library(
+ name = "zero_half_cuts",
+ srcs = ["zero_half_cuts.cc"],
+ hdrs = ["zero_half_cuts.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":integer",
+ ":util",
+@@ -1756,6 +1813,7 @@ cc_library(
+ name = "lp_utils",
+ srcs = ["lp_utils.cc"],
+ hdrs = ["lp_utils.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":boolean_problem",
+ ":boolean_problem_cc_proto",
+@@ -1786,6 +1844,7 @@ cc_library(
+ name = "optimization",
+ srcs = ["optimization.cc"],
+ hdrs = ["optimization.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":boolean_problem",
+ ":boolean_problem_cc_proto",
+@@ -1825,6 +1884,7 @@ cc_library(
+ name = "max_hs",
+ srcs = ["max_hs.cc"],
+ hdrs = ["max_hs.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":boolean_problem",
+ ":cp_model_cc_proto",
+@@ -1871,6 +1931,7 @@ cc_library(
+ name = "util",
+ srcs = ["util.cc"],
+ hdrs = ["util.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":model",
+ ":sat_base",
+@@ -1904,6 +1965,7 @@ cc_library(
+ name = "stat_tables",
+ srcs = ["stat_tables.cc"],
+ hdrs = ["stat_tables.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cp_model_cc_proto",
+ ":cp_model_lns",
+@@ -1926,6 +1988,7 @@ cc_library(
+ name = "table",
+ srcs = ["table.cc"],
+ hdrs = ["table.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":integer",
+ ":model",
+@@ -1943,6 +2006,7 @@ cc_library(
+ name = "cp_constraints",
+ srcs = ["cp_constraints.cc"],
+ hdrs = ["cp_constraints.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":integer",
+ ":model",
+@@ -1960,6 +2024,7 @@ cc_library(
+ name = "diffn_util",
+ srcs = ["diffn_util.cc"],
+ hdrs = ["diffn_util.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":integer",
+ ":intervals",
+@@ -1981,6 +2046,7 @@ cc_library(
+ name = "2d_orthogonal_packing",
+ srcs = ["2d_orthogonal_packing.cc"],
+ hdrs = ["2d_orthogonal_packing.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":2d_packing_brute_force",
+ ":integer",
+@@ -2000,6 +2066,7 @@ cc_library(
+ name = "2d_packing_brute_force",
+ srcs = ["2d_packing_brute_force.cc"],
+ hdrs = ["2d_packing_brute_force.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":diffn_util",
+ ":integer",
+@@ -2016,6 +2083,7 @@ cc_library(
+ name = "2d_rectangle_presolve",
+ srcs = ["2d_rectangle_presolve.cc"],
+ hdrs = ["2d_rectangle_presolve.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":diffn_util",
+ ":integer",
+@@ -2032,6 +2100,7 @@ cc_library(
+ testonly = 1,
+ srcs = ["2d_orthogonal_packing_testing.cc"],
+ hdrs = ["2d_orthogonal_packing_testing.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":diffn_util",
+ ":integer",
+@@ -2046,6 +2115,7 @@ cc_library(
+ name = "diffn",
+ srcs = ["diffn.cc"],
+ hdrs = ["diffn.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":2d_orthogonal_packing",
+ ":cumulative_energy",
+@@ -2075,6 +2145,7 @@ cc_library(
+ name = "circuit",
+ srcs = ["circuit.cc"],
+ hdrs = ["circuit.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":integer",
+ ":model",
+@@ -2097,6 +2168,7 @@ cc_library(
+ name = "encoding",
+ srcs = ["encoding.cc"],
+ hdrs = ["encoding.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":boolean_problem_cc_proto",
+ ":pb_constraint",
+@@ -2117,6 +2189,7 @@ cc_library(
+ name = "cp_model_lns",
+ srcs = ["cp_model_lns.cc"],
+ hdrs = ["cp_model_lns.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cp_model_cc_proto",
+ ":cp_model_mapping",
+@@ -2160,6 +2233,7 @@ cc_library(
+ name = "feasibility_pump",
+ srcs = ["feasibility_pump.cc"],
+ hdrs = ["feasibility_pump.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cp_model_mapping",
+ ":integer",
+@@ -2193,6 +2267,7 @@ cc_library(
+ name = "rins",
+ srcs = ["rins.cc"],
+ hdrs = ["rins.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cp_model_mapping",
+ ":integer",
+@@ -2211,6 +2286,7 @@ cc_library(
+ name = "subsolver",
+ srcs = ["subsolver.cc"],
+ hdrs = ["subsolver.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:threadpool",
+@@ -2230,6 +2306,7 @@ cc_library(
+ name = "drat_proof_handler",
+ srcs = ["drat_proof_handler.cc"],
+ hdrs = ["drat_proof_handler.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":drat_checker",
+ ":drat_writer",
+@@ -2246,6 +2323,7 @@ cc_library(
+ name = "drat_checker",
+ srcs = ["drat_checker.cc"],
+ hdrs = ["drat_checker.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":sat_base",
+ "//ortools/base",
+@@ -2265,6 +2343,7 @@ cc_library(
+ name = "drat_writer",
+ srcs = ["drat_writer.cc"],
+ hdrs = ["drat_writer.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":sat_base",
+ "//ortools/base",
+@@ -2311,6 +2390,7 @@ cc_binary(
+ cc_library(
+ name = "sat_cnf_reader",
+ hdrs = ["sat_cnf_reader.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":boolean_problem_cc_proto",
+ ":cp_model_cc_proto",
+@@ -2328,6 +2408,7 @@ cc_library(
+ name = "cp_model_symmetries",
+ srcs = ["cp_model_symmetries.cc"],
+ hdrs = ["cp_model_symmetries.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cp_model_cc_proto",
+ ":cp_model_checker",
+@@ -2367,6 +2448,7 @@ cc_library(
+ name = "swig_helper",
+ srcs = ["swig_helper.cc"],
+ hdrs = ["swig_helper.h"],
++ features = ["-layering_check"],
+ visibility = [
+ "//ortools/sat/java:__pkg__",
+ "//ortools/sat/python:__pkg__",
+@@ -2389,6 +2471,7 @@ cc_library(
+ name = "implied_bounds",
+ srcs = ["implied_bounds.cc"],
+ hdrs = ["implied_bounds.h"],
++ features = ["-layering_check"],
+ deps = [
+ "linear_constraint",
+ ":clause",
+@@ -2418,6 +2501,7 @@ cc_library(
+ cc_library(
+ name = "inclusion",
+ hdrs = ["inclusion.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "@com_google_absl//absl/log:check",
+@@ -2429,6 +2513,7 @@ cc_library(
+ name = "diophantine",
+ srcs = ["diophantine.cc"],
+ hdrs = ["diophantine.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":util",
+ "@com_google_absl//absl/log:check",
+@@ -2441,6 +2526,7 @@ cc_library(
+ name = "work_assignment",
+ srcs = ["work_assignment.cc"],
+ hdrs = ["work_assignment.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":cp_model_mapping",
+ ":cp_model_utils",
+diff --git a/ortools/scheduling/BUILD.bazel b/ortools/scheduling/BUILD.bazel
+index d2c0ef0..5c794d4 100644
+--- a/ortools/scheduling/BUILD.bazel
++++ b/ortools/scheduling/BUILD.bazel
+@@ -34,6 +34,7 @@ cc_library(
+ name = "jobshop_scheduling_parser",
+ srcs = ["jobshop_scheduling_parser.cc"],
+ hdrs = ["jobshop_scheduling_parser.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":jobshop_scheduling_cc_proto",
+ "//ortools/base",
+@@ -63,6 +64,7 @@ cc_library(
+ name = "rcpsp_parser",
+ srcs = ["rcpsp_parser.cc"],
+ hdrs = ["rcpsp_parser.h"],
++ features = ["-layering_check"],
+ visibility = ["//visibility:public"],
+ deps = [
+ ":rcpsp_cc_proto",
+diff --git a/ortools/util/BUILD.bazel b/ortools/util/BUILD.bazel
+index b2ee315..a123c8d 100644
+--- a/ortools/util/BUILD.bazel
++++ b/ortools/util/BUILD.bazel
+@@ -56,6 +56,7 @@ py_proto_library(
+ cc_library(
+ name = "affine_relation",
+ hdrs = ["affine_relation.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:iterator_adaptors",
+@@ -65,6 +66,7 @@ cc_library(
+ cc_library(
+ name = "filelineiter",
+ hdrs = ["filelineiter.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:file",
+@@ -77,6 +79,7 @@ cc_library(
+ name = "bitset",
+ srcs = ["bitset.cc"],
+ hdrs = ["bitset.h"],
++ features = ["-layering_check"],
+ deps = ["//ortools/base"],
+ )
+
+@@ -85,6 +88,7 @@ cc_library(
+ hdrs = [
+ "integer_pq.h",
+ ],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ ],
+@@ -94,6 +98,7 @@ cc_library(
+ name = "cached_log",
+ srcs = ["cached_log.cc"],
+ hdrs = ["cached_log.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:types",
+@@ -103,18 +108,21 @@ cc_library(
+ cc_library(
+ name = "zvector",
+ hdrs = ["zvector.h"],
++ features = ["-layering_check"],
+ deps = ["//ortools/base"],
+ )
+
+ cc_library(
+ name = "permutation",
+ hdrs = ["permutation.h"],
++ features = ["-layering_check"],
+ deps = ["//ortools/base"],
+ )
+
+ cc_library(
+ name = "saturated_arithmetic",
+ hdrs = ["saturated_arithmetic.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":bitset",
+ "//ortools/base",
+@@ -126,6 +134,7 @@ cc_library(
+ name = "piecewise_linear_function",
+ srcs = ["piecewise_linear_function.cc"],
+ hdrs = ["piecewise_linear_function.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":saturated_arithmetic",
+ "//ortools/base",
+@@ -140,6 +149,7 @@ cc_library(
+ name = "rational_approximation",
+ srcs = ["rational_approximation.cc"],
+ hdrs = ["rational_approximation.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "@com_google_absl//absl/strings",
+@@ -150,6 +160,7 @@ cc_library(
+ name = "sorted_interval_list",
+ srcs = ["sorted_interval_list.cc"],
+ hdrs = ["sorted_interval_list.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":saturated_arithmetic",
+ "//ortools/base",
+@@ -163,6 +174,7 @@ cc_library(
+ cc_library(
+ name = "string_array",
+ hdrs = ["string_array.h"],
++ features = ["-layering_check"],
+ deps = [
+ "@com_google_absl//absl/strings",
+ ],
+@@ -171,6 +183,7 @@ cc_library(
+ cc_library(
+ name = "tuple_set",
+ hdrs = ["tuple_set.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:hash",
+@@ -183,6 +196,7 @@ cc_library(
+ name = "stats",
+ srcs = ["stats.cc"],
+ hdrs = ["stats.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:stl_util",
+@@ -200,6 +214,7 @@ cc_library(
+ name = "time_limit",
+ srcs = ["time_limit.cc"],
+ hdrs = ["time_limit.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":running_stat",
+ "//ortools/base",
+@@ -216,6 +231,7 @@ cc_library(
+ name = "sigint",
+ srcs = ["sigint.cc"],
+ hdrs = ["sigint.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ ],
+@@ -234,6 +250,7 @@ cc_library(
+ "on_windows": [],
+ "//conditions:default": ["-frounding-math"],
+ }),
++ features = ["-layering_check"],
+ deps = [
+ ":bitset",
+ "//ortools/base",
+@@ -244,18 +261,21 @@ cc_library(
+ name = "monoid_operation_tree",
+ srcs = [],
+ hdrs = ["monoid_operation_tree.h"],
++ features = ["-layering_check"],
+ deps = ["//ortools/base"],
+ )
+
+ cc_library(
+ name = "return_macros",
+ hdrs = ["return_macros.h"],
++ features = ["-layering_check"],
+ deps = ["//ortools/base"],
+ )
+
+ cc_library(
+ name = "running_stat",
+ hdrs = ["running_stat.h"],
++ features = ["-layering_check"],
+ deps = ["//ortools/base"],
+ )
+
+@@ -263,6 +283,7 @@ cc_library(
+ name = "proto_tools",
+ srcs = ["proto_tools.cc"],
+ hdrs = ["proto_tools.h"],
++ features = ["-layering_check"],
+ deps = [
+ "@com_google_absl//absl/status",
+ "@com_google_absl//absl/status:statusor",
+@@ -302,6 +323,7 @@ cc_library(
+ hdrs = [
+ "functions_swig_helpers.h",
+ ],
++ features = ["-layering_check"],
+ deps = ["//ortools/base"],
+ )
+
+@@ -311,12 +333,14 @@ cc_library(
+ hdrs = [
+ "functions_swig_test_helpers.h",
+ ],
++ features = ["-layering_check"],
+ deps = ["//ortools/base"],
+ )
+
+ cc_library(
+ name = "range_minimum_query",
+ hdrs = ["range_minimum_query.h"],
++ features = ["-layering_check"],
+ deps = [":bitset"],
+ )
+
+@@ -324,6 +348,7 @@ cc_library(
+ name = "range_query_function",
+ srcs = ["range_query_function.cc"],
+ hdrs = ["range_query_function.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":range_minimum_query",
+ "//ortools/base",
+@@ -333,6 +358,7 @@ cc_library(
+ cc_library(
+ name = "rev",
+ hdrs = ["rev.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:map_util",
+@@ -343,6 +369,7 @@ cc_library(
+ cc_library(
+ name = "vector_or_function",
+ hdrs = ["vector_or_function.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ ],
+@@ -354,6 +381,7 @@ cc_library(
+ name = "qap_reader",
+ srcs = ["qap_reader.cc"],
+ hdrs = ["qap_reader.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/util:filelineiter",
+ "@com_google_absl//absl/strings",
+@@ -363,6 +391,7 @@ cc_library(
+ cc_library(
+ name = "sort",
+ hdrs = ["sort.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ ],
+@@ -372,6 +401,7 @@ cc_library(
+ name = "file_util",
+ srcs = ["file_util.cc"],
+ hdrs = ["file_util.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:dump_vars",
+@@ -391,6 +421,7 @@ cc_library(
+ cc_library(
+ name = "random_engine",
+ hdrs = ["random_engine.h"],
++ features = ["-layering_check"],
+ deps = [],
+ )
+
+@@ -398,6 +429,7 @@ cc_library(
+ name = "string_util",
+ srcs = ["string_util.cc"],
+ hdrs = ["string_util.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "@com_google_absl//absl/strings",
+@@ -408,12 +440,14 @@ cc_library(
+ cc_library(
+ name = "adaptative_parameter_value",
+ hdrs = ["adaptative_parameter_value.h"],
++ features = ["-layering_check"],
+ deps = ["//ortools/base"],
+ )
+
+ cc_library(
+ name = "lazy_mutable_copy",
+ hdrs = ["lazy_mutable_copy.h"],
++ features = ["-layering_check"],
+ deps = ["@com_google_absl//absl/memory"],
+ )
+
+@@ -421,6 +455,7 @@ cc_library(
+ name = "logging",
+ srcs = ["logging.cc"],
+ hdrs = ["logging.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:timer",
+@@ -431,11 +466,13 @@ cc_library(
+ cc_library(
+ name = "testing_utils",
+ hdrs = ["testing_utils.h"],
++ features = ["-layering_check"],
+ )
+
+ cc_library(
+ name = "strong_integers",
+ hdrs = ["strong_integers.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "@com_google_absl//absl/strings",
+@@ -445,6 +482,7 @@ cc_library(
+ cc_library(
+ name = "status_macros",
+ hdrs = ["status_macros.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base:status_macros",
+ "@com_google_absl//absl/status",
+@@ -455,6 +493,7 @@ cc_library(
+ name = "fp_roundtrip_conv",
+ srcs = ["fp_roundtrip_conv.cc"],
+ hdrs = ["fp_roundtrip_conv.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:status_builder",
+@@ -468,6 +507,7 @@ cc_library(
+ cc_library(
+ name = "flat_matrix",
+ hdrs = ["flat_matrix.h"],
++ features = ["-layering_check"],
+ deps = [
+ "@com_google_absl//absl/types:span",
+ ],
+@@ -477,6 +517,7 @@ cc_library(
+ name = "fp_roundtrip_conv_testing",
+ testonly = 1,
+ hdrs = ["fp_roundtrip_conv_testing.h"],
++ features = ["-layering_check"],
+ deps = [
+ "@com_google_absl//absl/strings",
+ ],
+@@ -486,6 +527,7 @@ cc_library(
+ name = "aligned_memory",
+ srcs = ["aligned_memory_internal.h"],
+ hdrs = ["aligned_memory.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base:mathutil",
+ ],
+@@ -495,6 +537,7 @@ cc_library(
+ name = "vector_sum",
+ srcs = ["vector_sum_internal.h"],
+ hdrs = ["vector_sum.h"],
++ features = ["-layering_check"],
+ deps = [
+ ":aligned_memory",
+ "@com_google_absl//absl/base:core_headers",
+@@ -506,6 +549,7 @@ cc_library(
+ name = "parse_proto",
+ srcs = ["parse_proto.cc"],
+ hdrs = ["parse_proto.h"],
++ features = ["-layering_check"],
+ deps = [
+ "@com_google_absl//absl/strings",
+ "@com_google_protobuf//:protobuf",
+@@ -516,6 +560,7 @@ cc_library(
+ name = "solve_interrupter",
+ srcs = ["solve_interrupter.cc"],
+ hdrs = ["solve_interrupter.h"],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:intops",
+@@ -529,6 +574,7 @@ cc_library(
+ cc_library(
+ name = "dense_set",
+ hdrs = ["dense_set.h"],
++ features = ["-layering_check"],
+ deps = [
+ "@com_google_absl//absl/log:check",
+ "@com_google_absl//absl/types:span",
+diff --git a/ortools/util/python/BUILD.bazel b/ortools/util/python/BUILD.bazel
+index 925cf57..765f573 100644
+--- a/ortools/util/python/BUILD.bazel
++++ b/ortools/util/python/BUILD.bazel
+@@ -21,6 +21,7 @@ load("@rules_python//python:defs.bzl", "py_test")
+ cc_library(
+ name = "sorted_interval_list_doc",
+ hdrs = ["sorted_interval_list_doc.h"],
++ features = ["-layering_check"],
+ visibility = ["//visibility:public"],
+ )
+
+diff --git a/ortools/xpress/BUILD.bazel b/ortools/xpress/BUILD.bazel
+index 22b6ed9..a86bc1d 100644
+--- a/ortools/xpress/BUILD.bazel
++++ b/ortools/xpress/BUILD.bazel
+@@ -21,6 +21,7 @@ cc_library(
+ hdrs = [
+ "environment.h",
+ ],
++ features = ["-layering_check"],
+ deps = [
+ "//ortools/base",
+ "//ortools/base:dynamic_library",
diff --git a/third_party/xla/third_party/py/py_import.bzl b/third_party/xla/third_party/py/py_import.bzl
index 08aa56f1b42c20..7ab46f8bfa00ef 100644
--- a/third_party/xla/third_party/py/py_import.bzl
+++ b/third_party/xla/third_party/py/py_import.bzl
@@ -49,13 +49,15 @@ def py_import(
 wheel,
 deps = [],
 wheel_deps = [],
- zip_deps = []):
+ zip_deps = [],
+ testonly = False):
 unpacked_wheel_name = name + "_unpacked_wheel"
 _unpacked_wheel(
 name = unpacked_wheel_name,
 wheel = wheel,
 wheel_deps = wheel_deps,
 zip_deps = zip_deps,
+ testonly = testonly,
 )
 py_library(
 name = name,
@@ -63,6 +65,7 @@ def py_import(
 imports = [unpacked_wheel_name],
 deps = deps,
 visibility = ["//visibility:public"],
+ testonly = testonly,
 )
 """Unpacks the wheel and uses its content as a py_library.
diff --git a/third_party/xla/third_party/py/python_init_pip.bzl b/third_party/xla/third_party/py/python_init_pip.bzl
index 7689b92b60a00a..39901b9b2e64ea 100644
--- a/third_party/xla/third_party/py/python_init_pip.bzl
+++ b/third_party/xla/third_party/py/python_init_pip.bzl
@@ -24,6 +24,10 @@ cc_library(
 cc_library(
 name = "numpy_headers",
 deps = [":numpy_headers_2", ":numpy_headers_1"],
+ # For the layering check to work we need to re-export the headers from the
+ # dependencies.
+ hdrs = glob(["site-packages/numpy/_core/include/**/*.h"]) +
+ glob(["site-packages/numpy/core/include/**/*.h"]),
 )
 """,
 ),
diff --git a/third_party/xla/third_party/py/python_init_rules.bzl b/third_party/xla/third_party/py/python_init_rules.bzl
index ac9b8eb3893441..e8bfd6548965e4 100644
--- a/third_party/xla/third_party/py/python_init_rules.bzl
+++ b/third_party/xla/third_party/py/python_init_rules.bzl
@@ -1,6 +1,5 @@
 """Hermetic Python initialization. Consult the WORKSPACE on how to use it."""
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 def python_init_rules(extra_patches = []):
@@ -11,15 +10,14 @@ def python_init_rules(extra_patches = []):
 set of patches.
 """
- http_archive(
+ tf_http_archive(
 name = "rules_cc",
- urls = ["https://github.com/bazelbuild/rules_cc/archive/refs/tags/0.1.0.tar.gz"],
+ urls = tf_mirror_urls("https://github.com/bazelbuild/rules_cc/archive/refs/tags/0.1.0.tar.gz"),
 strip_prefix = "rules_cc-0.1.0",
 sha256 = "4b12149a041ddfb8306a8fd0e904e39d673552ce82e4296e96fac9cbf0780e59",
- patches = [
- Label("//third_party/py:rules_cc_protobuf.patch"),
+ patch_file = [
+ "@local_xla//third_party/py:rules_cc_protobuf.patch",
 ],
- patch_args = ["-p1"],
 )
 tf_http_archive(
@@ -34,15 +32,14 @@ def python_init_rules(extra_patches = []):
 },
 )
- http_archive(
+ tf_http_archive(
 name = "rules_python",
 sha256 = "fa7dd2c6b7d63b3585028dd8a90a6cf9db83c33b250959c2ee7b583a6c130e12",
 strip_prefix = "rules_python-1.6.0",
- url = "https://github.com/bazelbuild/rules_python/releases/download/1.6.0/rules_python-1.6.0.tar.gz",
- patch_args = ["-p1"],
- patches = [
- Label("//third_party/py:rules_python_pip_version.patch"),
- Label("//third_party/py:rules_python_freethreaded.patch"),
- Label("//third_party/py:rules_python_versions.patch"),
+ urls = tf_mirror_urls("https://github.com/bazelbuild/rules_python/releases/download/1.6.0/rules_python-1.6.0.tar.gz"),
+ patch_file = [
+ "@local_xla//third_party/py:rules_python_pip_version.patch",
+ "@local_xla//third_party/py:rules_python_freethreaded.patch",
+ "@local_xla//third_party/py:rules_python_versions.patch",
 ] + extra_patches,
 )
diff --git a/third_party/xla/third_party/py/python_init_toolchains.bzl b/third_party/xla/third_party/py/python_init_toolchains.bzl
index 860fc08ceda2a8..82d755c32bbfba 100644
--- a/third_party/xla/third_party/py/python_init_toolchains.bzl
+++ b/third_party/xla/third_party/py/python_init_toolchains.bzl
@@ -41,7 +41,6 @@ def python_init_toolchains(name = "python", python_version = None, **kwargs):
 tool_version = MINOR_MAPPING.get(HERMETIC_PYTHON_VERSION)
 if not tool_version:
 tool_version = HERMETIC_PYTHON_VERSION + ".0"
- url_components = HERMETIC_PYTHON_URL.split("://", 1)
 sha256s = {}
 for platform in PLATFORMS.keys():
@@ -51,12 +50,12 @@ def python_init_toolchains(name = "python", python_version = None, **kwargs):
 python_register_toolchains(
 name = get_toolchain_name_per_python_version(name),
- base_url = url_components[0] + "://",
+ base_url = "",
 ignore_root_user_error = True,
 python_version = tool_version,
 tool_versions = {
 tool_version: {
- "url": url_components[1],
+ "url": HERMETIC_PYTHON_URL,
 "sha256": sha256s,
 "strip_prefix": HERMETIC_PYTHON_PREFIX,
 },
diff --git a/third_party/xla/third_party/py/rules_python_versions.patch b/third_party/xla/third_party/py/rules_python_versions.patch
index 8dbc70bad193d7..c31b6772c2675f 100644
--- a/third_party/xla/third_party/py/rules_python_versions.patch
+++ b/third_party/xla/third_party/py/rules_python_versions.patch
@@ -1,8 +1,60 @@
 diff --git a/python/versions.bzl b/python/versions.bzl
-index 30929f82..8e79225a 100644
+index 30929f82..c0856d70 100644
 --- a/python/versions.bzl
 +++ b/python/versions.bzl
-@@ -855,6 +855,51 @@ TOOL_VERSIONS = {
+@@ -810,6 +810,51 @@ TOOL_VERSIONS = {
+ "x86_64-unknown-linux-gnu-freethreaded": "python/install",
+ },
+ },
++ "3.13.11": {
++ "url": "20251209/cpython-{python_version}+20251209-{platform}-{build}.{ext}",
++ "sha256": {
++ "aarch64-apple-darwin": "295a9f7bc899ea1cc08baf60bbf511bdd1e4a29b2dd7e5f59b48f18bfa6bf585",
++ "aarch64-unknown-linux-gnu": "ea1e678e6e82301bb32bf3917732125949b6e46d541504465972024a3f165343",
++ "ppc64le-unknown-linux-gnu": "7660e53aad9d35ee256913c6d98427f81f078699962035c5fa8b5c3138695109",
++ "riscv64-unknown-linux-gnu": "763fa1548e6a432e9402916e690c74ea30f26dcd2e131893dd506f72b87c27c9",
++ "s390x-unknown-linux-gnu": "ffb6af51fbfabfc6fbc4e7379bdec70c2f51e972b1d2f45c053493b9da3a1bbe",
++ "x86_64-apple-darwin": "dac4a0a0a9b71f6b02a8b0886547fa22814474239bffb948e3e77185406ea136",
++ "x86_64-pc-windows-msvc": "87822417007045a28a7eccc47fe67b8c61265b99b10dbbfa24d231a3622b1c27",
++ "aarch64-pc-windows-msvc": "ba646d0c3b7dd7bdfb770d9b2ebd6cd2df02a37fda90c9c79a7cf59c7df6f165",
++ "aarch64-pc-windows-msvc-freethreaded": "6daf6d092c7294cfe68c4c7bf2698ac134235489c874b3bf796c7972b9dbba30",
++ "x86_64-unknown-linux-gnu": "1ffa06d714a44aea14c0c54c30656413e5955a6c92074b4b3cb4351dcc28b63b",
++ "x86_64-unknown-linux-musl": "969fe24017380b987c4e3ce15e9edf82a4618c1e61672b2cc9b021a1c98eae78",
++ "aarch64-apple-darwin-freethreaded": "4213058b7fcd875596c12b58cd46a399358b0a87ecde4b349cbdd00cf87ed79a",
++ "aarch64-unknown-linux-gnu-freethreaded": "290ca3bd0007db9e551f90b08dfcb6c1b2d62c33b2fc3e9a43e77d385d94f569",
++ "ppc64le-unknown-linux-gnu-freethreaded": "09d4b50f8abb443f7e3af858c920aa61c2430b0954df465e861caa7078e55e69",
++ "riscv64-unknown-linux-gnu-freethreaded": "5406f2a7cacafbd2aac3ce2de066a0929aab55423824276c36e04cb83babc36c",
++ "s390x-unknown-linux-gnu-freethreaded": "3984b67c4292892eaccdd1c094c7ec788884c4c9b3534ab6995f6be96d5ed51d",
++ "x86_64-apple-darwin-freethreaded": "d6f489464045d6895ae68b0a04a9e16477e74fe3185a75f3a9a0af8ccd25eade",
++ "x86_64-pc-windows-msvc-freethreaded": "bb9a29a7ba8f179273b79971da6aaa7be592d78c606a63f99eff3e4c12fb0fae",
++ "x86_64-unknown-linux-gnu-freethreaded": "33f89c957d986d525529b8a980103735776f4d20cf52f55960a057c760188ac3",
++ },
++ "strip_prefix": {
++ "aarch64-apple-darwin": "python",
++ "aarch64-unknown-linux-gnu": "python",
++ "ppc64le-unknown-linux-gnu": "python",
++ "s390x-unknown-linux-gnu": "python",
++ "riscv64-unknown-linux-gnu": "python",
++ "x86_64-apple-darwin": "python",
++ "x86_64-pc-windows-msvc": "python",
++ "aarch64-pc-windows-msvc": "python",
++ "x86_64-unknown-linux-gnu": "python",
++ "x86_64-unknown-linux-musl": "python",
++ "aarch64-apple-darwin-freethreaded": "python/install",
++ "aarch64-unknown-linux-gnu-freethreaded": "python/install",
++ "ppc64le-unknown-linux-gnu-freethreaded": "python/install",
++ "riscv64-unknown-linux-gnu-freethreaded": "python/install",
++ "s390x-unknown-linux-gnu-freethreaded": "python/install",
++ "x86_64-apple-darwin-freethreaded": "python/install",
++ "x86_64-pc-windows-msvc-freethreaded": "python/install",
++ "aarch64-pc-windows-msvc-freethreaded": "python/install",
++ "x86_64-unknown-linux-gnu-freethreaded": "python/install",
++ },
++ },
+ "3.14.0rc1": {
+ "url": "20250808/cpython-{python_version}+20250808-{platform}-{build}.{ext}",
+ "sha256": {
+@@ -855,6 +900,51 @@ TOOL_VERSIONS = {
 "x86_64-unknown-linux-gnu-freethreaded": "python/install",
 },
 },
@@ -54,16 +106,18 @@ index 30929f82..8e79225a 100644
 }
 # buildifier: disable=unsorted-dict-items
-@@ -865,7 +910,7 @@ MINOR_MAPPING = {
+@@ -864,8 +954,8 @@ MINOR_MAPPING = {
+ "3.10": "3.10.18",
 "3.11": "3.11.13",
 "3.12": "3.12.11",
- "3.13": "3.13.6",
+- "3.13": "3.13.6",
- "3.14": "3.14.0rc1",
++ "3.13": "3.13.11",
+ "3.14": "3.14.0",
 }
 def _generate_platforms():
-@@ -1045,29 +1090,25 @@ def get_release_info(platform, python_version, base_url = DEFAULT_RELEASE_BASE_U
+@@ -1045,29 +1135,25 @@ def get_release_info(platform, python_version, base_url = DEFAULT_RELEASE_BASE_U
 for u in url:
 p, _, _ = platform.partition(FREETHREADED)
diff --git a/third_party/xla/third_party/riegeli/BUILD.bazel b/third_party/xla/third_party/riegeli/BUILD.bazel
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/third_party/xla/third_party/riegeli/workspace.bzl b/third_party/xla/third_party/riegeli/workspace.bzl
new file mode 100644
index 00000000000000..577511ee10e83e
--- /dev/null
+++ b/third_party/xla/third_party/riegeli/workspace.bzl
@@ -0,0 +1,11 @@
+"""Provides the repo macro to import riegeli"""
+
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+def repo():
+ tf_http_archive(
+ name = "com_google_riegeli",
+ sha256 = "f63337f63f794ba9dc7dd281b20af3d036dfe0c1a5a4b7b8dc20b39f7e323b97",
+ strip_prefix = "riegeli-9f2744dc23e81d84c02f6f51244e9e9bb9802d57",
+ urls = tf_mirror_urls("https://github.com/google/riegeli/archive/9f2744dc23e81d84c02f6f51244e9e9bb9802d57.tar.gz"),
+ )
diff --git a/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD b/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD
index 11795b3537e7a9..1e52bb31c540fc 100644
--- a/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD
+++ b/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD
@@ -24,6 +24,7 @@ cc_binary(
 "@llvm-project//llvm:Core",
 "@llvm-project//llvm:IRReader",
 "@llvm-project//llvm:Support",
+ "@llvm-project//llvm:config",
 ],
 )
diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index 1f51d21f432dd8..e69de29bb2d1d6 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -1,1157 +0,0 @@
-diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
-index f04aa96..509398d 100644
---- a/third_party/llvm/generated.patch
-+++ b/third_party/llvm/generated.patch
-@@ -1,1137 +1 @@
- Auto generated patch. Do not edit or delete it, even if empty.
--diff -ruN --strip-trailing-cr a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
----- a/clang/docs/LanguageExtensions.rst
--+++ b/clang/docs/LanguageExtensions.rst
--@@ -1833,23 +1833,6 @@
--
-- Clang provides a few builtin aliases to improve the throughput of certain metaprogramming facilities.
--
---__builtin_common_reference
-----------------------------
---
---.. code-block:: c++
---
--- template